regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #define REG_COMP_C
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 EXTERN_C const struct regexp_engine my_reg_engine;
  81 #else
  82 #  include "regcomp.h"
  83 #endif
  84
  85 #include "dquote_inline.h"
  86 #include "invlist_inline.h"
  87 #include "unicode_constants.h"
  88
  89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  90  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  92  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  95
  96 #ifndef STATIC
  97 #define STATIC  static
  98 #endif
  99
 100 /* this is a chain of data about sub patterns we are processing that
 101    need to be handled separately/specially in study_chunk. Its so
 102    we can simulate recursion without losing state.  */
 103 struct scan_frame;
 104 typedef struct scan_frame {
 105     regnode *last_regnode;      /* last node to process in this frame */
 106     regnode *next_regnode;      /* next node to process when last is reached */
 107     U32 prev_recursed_depth;
 108     I32 stopparen;              /* what stopparen do we use */
 109
 110     struct scan_frame *this_prev_frame; /* this previous frame */
 111     struct scan_frame *prev_frame;      /* previous frame */
 112     struct scan_frame *next_frame;      /* next frame */
 113 } scan_frame;
 114
 115 /* Certain characters are output as a sequence with the first being a
 116  * backslash. */
 117 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 118
 119
 120 struct RExC_state_t {
 121     U32         flags;                  /* RXf_* are we folding, multilining? */
 122     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 123     char        *precomp;               /* uncompiled string. */
 124     char        *precomp_end;           /* pointer to end of uncompiled string. */
 125     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 126     regexp      *rx;                    /* perl core regexp structure */
 127     regexp_internal     *rxi;           /* internal data for regexp object
 128                                            pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     char        *copy_start;            /* start of copy of input within
 133                                            constructed parse string */
 134     char        *save_copy_start;       /* Provides one level of saving
 135                                            and restoring 'copy_start' */
 136     char        *copy_start_in_input;   /* Position in input string
 137                                            corresponding to copy_start */
 138     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 139     regnode     *emit_start;            /* Start of emitted-code area */
 140     regnode_offset emit;                /* Code-emit pointer */
 141     I32         naughty;                /* How bad is this pattern? */
 142     I32         sawback;                /* Did we see \1, ...? */
 143     U32         seen;
 144     SSize_t     size;                   /* Number of regnode equivalents in
 145                                            pattern */
 146
 147     /* position beyond 'precomp' of the warning message furthest away from
 148      * 'precomp'.  During the parse, no warnings are raised for any problems
 149      * earlier in the parse than this position.  This works if warnings are
 150      * raised the first time a given spot is parsed, and if only one
 151      * independent warning is raised for any given spot */
 152     Size_t      latest_warn_offset;
 153
 154     I32         npar;                   /* Capture buffer count so far in the
 155                                            parse, (OPEN) plus one. ("par" 0 is
 156                                            the whole pattern)*/
 157     I32         total_par;              /* During initial parse, is either 0,
 158                                            or -1; the latter indicating a
 159                                            reparse is needed.  After that pass,
 160                                            it is what 'npar' became after the
 161                                            pass.  Hence, it being > 0 indicates
 162                                            we are in a reparse situation */
 163     I32         nestroot;               /* root parens we are in - used by
 164                                            accept */
 165     I32         seen_zerolen;
 166     regnode_offset *open_parens;        /* offsets to open parens */
 167     regnode_offset *close_parens;       /* offsets to close parens */
 168     I32      parens_buf_size;           /* #slots malloced open/close_parens */
 169     regnode     *end_op;                /* END node in program */
 170     I32         utf8;           /* whether the pattern is utf8 or not */
 171     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 172                                 /* XXX use this for future optimisation of case
 173                                  * where pattern must be upgraded to utf8. */
 174     I32         uni_semantics;  /* If a d charset modifier should use unicode
 175                                    rules, even if the pattern is not in
 176                                    utf8 */
 177     HV          *paren_names;           /* Paren names */
 178
 179     regnode     **recurse;              /* Recurse regops */
 180     I32         recurse_count;          /* Number of recurse regops we have generated */
 181     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 182                                            through */
 183     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 184     I32         in_lookbehind;
 185     I32         in_lookahead;
 186     I32         contains_locale;
 187     I32         override_recoding;
 188     I32         recode_x_to_native;
 189     I32         in_multi_char_class;
 190     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 191                                             within pattern */
 192     int         code_index;             /* next code_blocks[] slot */
 193     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 194     scan_frame *frame_head;
 195     scan_frame *frame_last;
 196     U32         frame_count;
 197     AV         *warn_text;
 198     HV         *unlexed_names;
 199 #ifdef ADD_TO_REGEXEC
 200     char        *starttry;              /* -Dr: where regtry was called. */
 201 #define RExC_starttry   (pRExC_state->starttry)
 202 #endif
 203     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 204 #ifdef DEBUGGING
 205     const char  *lastparse;
 206     I32         lastnum;
 207     AV          *paren_name_list;       /* idx -> name */
 208     U32         study_chunk_recursed_count;
 209     SV          *mysv1;
 210     SV          *mysv2;
 211
 212 #define RExC_lastparse  (pRExC_state->lastparse)
 213 #define RExC_lastnum    (pRExC_state->lastnum)
 214 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 215 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 216 #define RExC_mysv       (pRExC_state->mysv1)
 217 #define RExC_mysv1      (pRExC_state->mysv1)
 218 #define RExC_mysv2      (pRExC_state->mysv2)
 219
 220 #endif
 221     bool        seen_d_op;
 222     bool        strict;
 223     bool        study_started;
 224     bool        in_script_run;
 225     bool        use_BRANCHJ;
 226 };
 227
 228 #define RExC_flags      (pRExC_state->flags)
 229 #define RExC_pm_flags   (pRExC_state->pm_flags)
 230 #define RExC_precomp    (pRExC_state->precomp)
 231 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 232 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 233 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
 234 #define RExC_precomp_end (pRExC_state->precomp_end)
 235 #define RExC_rx_sv      (pRExC_state->rx_sv)
 236 #define RExC_rx         (pRExC_state->rx)
 237 #define RExC_rxi        (pRExC_state->rxi)
 238 #define RExC_start      (pRExC_state->start)
 239 #define RExC_end        (pRExC_state->end)
 240 #define RExC_parse      (pRExC_state->parse)
 241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 242 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 244                                                    under /d from /u ? */
 245
 246 #ifdef RE_TRACK_PATTERN_OFFSETS
 247 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 248                                                          others */
 249 #endif
 250 #define RExC_emit       (pRExC_state->emit)
 251 #define RExC_emit_start (pRExC_state->emit_start)
 252 #define RExC_sawback    (pRExC_state->sawback)
 253 #define RExC_seen       (pRExC_state->seen)
 254 #define RExC_size       (pRExC_state->size)
 255 #define RExC_maxlen        (pRExC_state->maxlen)
 256 #define RExC_npar       (pRExC_state->npar)
 257 #define RExC_total_parens       (pRExC_state->total_par)
 258 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 259 #define RExC_nestroot   (pRExC_state->nestroot)
 260 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 261 #define RExC_utf8       (pRExC_state->utf8)
 262 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 263 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 264 #define RExC_open_parens        (pRExC_state->open_parens)
 265 #define RExC_close_parens       (pRExC_state->close_parens)
 266 #define RExC_end_op     (pRExC_state->end_op)
 267 #define RExC_paren_names        (pRExC_state->paren_names)
 268 #define RExC_recurse    (pRExC_state->recurse)
 269 #define RExC_recurse_count      (pRExC_state->recurse_count)
 270 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 271 #define RExC_study_chunk_recursed_bytes  \
 272                                    (pRExC_state->study_chunk_recursed_bytes)
 273 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 274 #define RExC_in_lookahead       (pRExC_state->in_lookahead)
 275 #define RExC_contains_locale    (pRExC_state->contains_locale)
 276 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 277
 278 #ifdef EBCDIC
 279 #  define SET_recode_x_to_native(x)                                         \
 280                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 281 #else
 282 #  define SET_recode_x_to_native(x) NOOP
 283 #endif
 284
 285 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 286 #define RExC_frame_head (pRExC_state->frame_head)
 287 #define RExC_frame_last (pRExC_state->frame_last)
 288 #define RExC_frame_count (pRExC_state->frame_count)
 289 #define RExC_strict (pRExC_state->strict)
 290 #define RExC_study_started      (pRExC_state->study_started)
 291 #define RExC_warn_text (pRExC_state->warn_text)
 292 #define RExC_in_script_run      (pRExC_state->in_script_run)
 293 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 294 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 295
 296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 297  * a flag to disable back-off on the fixed/floating substrings - if it's
 298  * a high complexity pattern we assume the benefit of avoiding a full match
 299  * is worth the cost of checking for the substrings even if they rarely help.
 300  */
 301 #define RExC_naughty    (pRExC_state->naughty)
 302 #define TOO_NAUGHTY (10)
 303 #define MARK_NAUGHTY(add) \
 304     if (RExC_naughty < TOO_NAUGHTY) \
 305         RExC_naughty += (add)
 306 #define MARK_NAUGHTY_EXP(exp, add) \
 307     if (RExC_naughty < TOO_NAUGHTY) \
 308         RExC_naughty += RExC_naughty / (exp) + (add)
 309
 310 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 311 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 312         ((*s) == '{' && regcurly(s)))
 313
 314 /*
 315  * Flags to be passed up and down.
 316  */
 317 #define WORST           0       /* Worst case. */
 318 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 319                                    non-null ones. */
 320
 321 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 322  * character.  (There needs to be a case: in the switch statement in regexec.c
 323  * for any node marked SIMPLE.)  Note that this is not the same thing as
 324  * REGNODE_SIMPLE */
 325 #define SIMPLE          0x02
 326 #define SPSTART         0x04    /* Starts with * or + */
 327 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 328 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 329 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 330 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 331                                    calcuate sizes as UTF-8 */
 332
 333 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 334
 335 /* whether trie related optimizations are enabled */
 336 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 337 #define TRIE_STUDY_OPT
 338 #define FULL_TRIE_STUDY
 339 #define TRIE_STCLASS
 340 #endif
 341
 342
 343
 344 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 345 #define PBITVAL(paren) (1 << ((paren) & 7))
 346 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 347 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 348 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 349
 350 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 351                                      if (!UTF) {                           \
 352                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 353                                          return 0;                         \
 354                                      }                                     \
 355                              } STMT_END
 356
 357 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 358  * a flag that indicates we need to override /d with /u as a result of
 359  * something in the pattern.  It should only be used in regards to calling
 360  * set_regex_charset() or get_regex_charse() */
 361 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 362     STMT_START {                                                            \
 363             if (DEPENDS_SEMANTICS) {                                        \
 364                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 365                 RExC_uni_semantics = 1;                                     \
 366                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 367                     /* No need to restart the parse if we haven't seen      \
 368                      * anything that differs between /u and /d, and no need \
 369                      * to restart immediately if we're going to reparse     \
 370                      * anyway to count parens */                            \
 371                     *flagp |= RESTART_PARSE;                                \
 372                     return restart_retval;                                  \
 373                 }                                                           \
 374             }                                                               \
 375     } STMT_END
 376
 377 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 378     STMT_START {                                                            \
 379                 RExC_use_BRANCHJ = 1;                                       \
 380                 *flagp |= RESTART_PARSE;                                    \
 381                 return restart_retval;                                      \
 382     } STMT_END
 383
 384 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 385  * less.  After that, it must always be positive, because the whole re is
 386  * considered to be surrounded by virtual parens.  Setting it to negative
 387  * indicates there is some construct that needs to know the actual number of
 388  * parens to be properly handled.  And that means an extra pass will be
 389  * required after we've counted them all */
 390 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 391 #define REQUIRE_PARENS_PASS                                                 \
 392     STMT_START {  /* No-op if have completed a pass */                      \
 393                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 394     } STMT_END
 395 #define IN_PARENS_PASS (RExC_total_parens < 0)
 396
 397
 398 /* This is used to return failure (zero) early from the calling function if
 399  * various flags in 'flags' are set.  Two flags always cause a return:
 400  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 401  * additional flags that should cause a return; 0 if none.  If the return will
 402  * be done, '*flagp' is first set to be all of the flags that caused the
 403  * return. */
 404 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 405     STMT_START {                                                            \
 406             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 407                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 408                 return 0;                                                   \
 409             }                                                               \
 410     } STMT_END
 411
 412 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 413
 414 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 415                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 416 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 417                                     if (MUST_RESTART(*(flagp))) return 0
 418
 419 /* This converts the named class defined in regcomp.h to its equivalent class
 420  * number defined in handy.h. */
 421 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 422 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 423
 424 #define _invlist_union_complement_2nd(a, b, output) \
 425                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 426 #define _invlist_intersection_complement_2nd(a, b, output) \
 427                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 428
 429 /* About scan_data_t.
 430
 431   During optimisation we recurse through the regexp program performing
 432   various inplace (keyhole style) optimisations. In addition study_chunk
 433   and scan_commit populate this data structure with information about
 434   what strings MUST appear in the pattern. We look for the longest
 435   string that must appear at a fixed location, and we look for the
 436   longest string that may appear at a floating location. So for instance
 437   in the pattern:
 438
 439     /FOO[xX]A.*B[xX]BAR/
 440
 441   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 442   strings (because they follow a .* construct). study_chunk will identify
 443   both FOO and BAR as being the longest fixed and floating strings respectively.
 444
 445   The strings can be composites, for instance
 446
 447      /(f)(o)(o)/
 448
 449   will result in a composite fixed substring 'foo'.
 450
 451   For each string some basic information is maintained:
 452
 453   - min_offset
 454     This is the position the string must appear at, or not before.
 455     It also implicitly (when combined with minlenp) tells us how many
 456     characters must match before the string we are searching for.
 457     Likewise when combined with minlenp and the length of the string it
 458     tells us how many characters must appear after the string we have
 459     found.
 460
 461   - max_offset
 462     Only used for floating strings. This is the rightmost point that
 463     the string can appear at. If set to SSize_t_MAX it indicates that the
 464     string can occur infinitely far to the right.
 465     For fixed strings, it is equal to min_offset.
 466
 467   - minlenp
 468     A pointer to the minimum number of characters of the pattern that the
 469     string was found inside. This is important as in the case of positive
 470     lookahead or positive lookbehind we can have multiple patterns
 471     involved. Consider
 472
 473     /(?=FOO).*F/
 474
 475     The minimum length of the pattern overall is 3, the minimum length
 476     of the lookahead part is 3, but the minimum length of the part that
 477     will actually match is 1. So 'FOO's minimum length is 3, but the
 478     minimum length for the F is 1. This is important as the minimum length
 479     is used to determine offsets in front of and behind the string being
 480     looked for.  Since strings can be composites this is the length of the
 481     pattern at the time it was committed with a scan_commit. Note that
 482     the length is calculated by study_chunk, so that the minimum lengths
 483     are not known until the full pattern has been compiled, thus the
 484     pointer to the value.
 485
 486   - lookbehind
 487
 488     In the case of lookbehind the string being searched for can be
 489     offset past the start point of the final matching string.
 490     If this value was just blithely removed from the min_offset it would
 491     invalidate some of the calculations for how many chars must match
 492     before or after (as they are derived from min_offset and minlen and
 493     the length of the string being searched for).
 494     When the final pattern is compiled and the data is moved from the
 495     scan_data_t structure into the regexp structure the information
 496     about lookbehind is factored in, with the information that would
 497     have been lost precalculated in the end_shift field for the
 498     associated string.
 499
 500   The fields pos_min and pos_delta are used to store the minimum offset
 501   and the delta to the maximum offset at the current point in the pattern.
 502
 503 */
 504
 505 struct scan_data_substrs {
 506     SV      *str;       /* longest substring found in pattern */
 507     SSize_t min_offset; /* earliest point in string it can appear */
 508     SSize_t max_offset; /* latest point in string it can appear */
 509     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 510     SSize_t lookbehind; /* is the pos of the string modified by LB */
 511     I32 flags;          /* per substring SF_* and SCF_* flags */
 512 };
 513
 514 typedef struct scan_data_t {
 515     /*I32 len_min;      unused */
 516     /*I32 len_delta;    unused */
 517     SSize_t pos_min;
 518     SSize_t pos_delta;
 519     SV *last_found;
 520     SSize_t last_end;       /* min value, <0 unless valid. */
 521     SSize_t last_start_min;
 522     SSize_t last_start_max;
 523     U8      cur_is_floating; /* whether the last_* values should be set as
 524                               * the next fixed (0) or floating (1)
 525                               * substring */
 526
 527     /* [0] is longest fixed substring so far, [1] is longest float so far */
 528     struct scan_data_substrs  substrs[2];
 529
 530     I32 flags;             /* common SF_* and SCF_* flags */
 531     I32 whilem_c;
 532     SSize_t *last_closep;
 533     regnode_ssc *start_class;
 534 } scan_data_t;
 535
 536 /*
 537  * Forward declarations for pregcomp()'s friends.
 538  */
 539
 540 static const scan_data_t zero_scan_data = {
 541     0, 0, NULL, 0, 0, 0, 0,
 542     {
 543         { NULL, 0, 0, 0, 0, 0 },
 544         { NULL, 0, 0, 0, 0, 0 },
 545     },
 546     0, 0, NULL, NULL
 547 };
 548
 549 /* study flags */
 550
 551 #define SF_BEFORE_SEOL          0x0001
 552 #define SF_BEFORE_MEOL          0x0002
 553 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 554
 555 #define SF_IS_INF               0x0040
 556 #define SF_HAS_PAR              0x0080
 557 #define SF_IN_PAR               0x0100
 558 #define SF_HAS_EVAL             0x0200
 559
 560
 561 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 562  * longest substring in the pattern. When it is not set the optimiser keeps
 563  * track of position, but does not keep track of the actual strings seen,
 564  *
 565  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 566  * /foo/i will not.
 567  *
 568  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 569  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 570  * turned off because of the alternation (BRANCH). */
 571 #define SCF_DO_SUBSTR           0x0400
 572
 573 #define SCF_DO_STCLASS_AND      0x0800
 574 #define SCF_DO_STCLASS_OR       0x1000
 575 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 576 #define SCF_WHILEM_VISITED_POS  0x2000
 577
 578 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 579 #define SCF_SEEN_ACCEPT         0x8000
 580 #define SCF_TRIE_DOING_RESTUDY 0x10000
 581 #define SCF_IN_DEFINE          0x20000
 582
 583
 584
 585
 586 #define UTF cBOOL(RExC_utf8)
 587
 588 /* The enums for all these are ordered so things work out correctly */
 589 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 590 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 591                                                      == REGEX_DEPENDS_CHARSET)
 592 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 593 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 594                                                      >= REGEX_UNICODE_CHARSET)
 595 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 596                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 597 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 598                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 599 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 600                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 601
 602 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 603
 604 /* For programs that want to be strictly Unicode compatible by dying if any
 605  * attempt is made to match a non-Unicode code point against a Unicode
 606  * property.  */
 607 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 608
 609 #define OOB_NAMEDCLASS          -1
 610
 611 /* There is no code point that is out-of-bounds, so this is problematic.  But
 612  * its only current use is to initialize a variable that is always set before
 613  * looked at. */
 614 #define OOB_UNICODE             0xDEADBEEF
 615
 616 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 617
 618
 619 /* length of regex to show in messages that don't mark a position within */
 620 #define RegexLengthToShowInErrorMessages 127
 621
 622 /*
 623  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 624  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 625  * op/pragma/warn/regcomp.
 626  */
 627 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 628 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 629
 630 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 631                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 632
 633 /* The code in this file in places uses one level of recursion with parsing
 634  * rebased to an alternate string constructed by us in memory.  This can take
 635  * the form of something that is completely different from the input, or
 636  * something that uses the input as part of the alternate.  In the first case,
 637  * there should be no possibility of an error, as we are in complete control of
 638  * the alternate string.  But in the second case we don't completely control
 639  * the input portion, so there may be errors in that.  Here's an example:
 640  *      /[abc\x{DF}def]/ui
 641  * is handled specially because \x{df} folds to a sequence of more than one
 642  * character: 'ss'.  What is done is to create and parse an alternate string,
 643  * which looks like this:
 644  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 645  * where it uses the input unchanged in the middle of something it constructs,
 646  * which is a branch for the DF outside the character class, and clustering
 647  * parens around the whole thing. (It knows enough to skip the DF inside the
 648  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 649  * need to be reported.  The general situation looks like this:
 650  *
 651  *                                       |<------- identical ------>|
 652  *              sI                       tI               xI       eI
 653  * Input:       ---------------------------------------------------------------
 654  * Constructed:         ---------------------------------------------------
 655  *                      sC               tC               xC       eC     EC
 656  *                                       |<------- identical ------>|
 657  *
 658  * sI..eI   is the portion of the input pattern we are concerned with here.
 659  * sC..EC   is the constructed substitute parse string.
 660  *  sC..tC  is constructed by us
 661  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 662  *          In the diagram, these are vertically aligned.
 663  *  eC..EC  is also constructed by us.
 664  * xC       is the position in the substitute parse string where we found a
 665  *          problem.
 666  * xI       is the position in the original pattern corresponding to xC.
 667  *
 668  * We want to display a message showing the real input string.  Thus we need to
 669  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 670  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 671  * get:
 672  *      xI = tI + (xC - tC)
 673  *
 674  * When the substitute parse is constructed, the code needs to set:
 675  *      RExC_start (sC)
 676  *      RExC_end (eC)
 677  *      RExC_copy_start_in_input  (tI)
 678  *      RExC_copy_start_in_constructed (tC)
 679  * and restore them when done.
 680  *
 681  * During normal processing of the input pattern, both
 682  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 683  * sI, so that xC equals xI.
 684  */
 685
 686 #define sI              RExC_precomp
 687 #define eI              RExC_precomp_end
 688 #define sC              RExC_start
 689 #define eC              RExC_end
 690 #define tI              RExC_copy_start_in_input
 691 #define tC              RExC_copy_start_in_constructed
 692 #define xI(xC)          (tI + (xC - tC))
 693 #define xI_offset(xC)   (xI(xC) - sI)
 694
 695 #define REPORT_LOCATION_ARGS(xC)                                            \
 696     UTF8fARG(UTF,                                                           \
 697              (xI(xC) > eI) /* Don't run off end */                          \
 698               ? eI - sI   /* Length before the <--HERE */                   \
 699               : ((xI_offset(xC) >= 0)                                       \
 700                  ? xI_offset(xC)                                            \
 701                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 702                                     IVdf " trying to output message for "   \
 703                                     " pattern %.*s",                        \
 704                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 705                                     ((int) (eC - sC)), sC), 0)),            \
 706              sI),         /* The input pattern printed up to the <--HERE */ \
 707     UTF8fARG(UTF,                                                           \
 708              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 709              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 710
 711 /* Used to point after bad bytes for an error message, but avoid skipping
 712  * past a nul byte. */
 713 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 714
 715 /* Set up to clean up after our imminent demise */
 716 #define PREPARE_TO_DIE                                                      \
 717     STMT_START {                                                            \
 718         if (RExC_rx_sv)                                                     \
 719             SAVEFREESV(RExC_rx_sv);                                         \
 720         if (RExC_open_parens)                                               \
 721             SAVEFREEPV(RExC_open_parens);                                   \
 722         if (RExC_close_parens)                                              \
 723             SAVEFREEPV(RExC_close_parens);                                  \
 724     } STMT_END
 725
 726 /*
 727  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 728  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 729  * "...".
 730  */
 731 #define _FAIL(code) STMT_START {                                        \
 732     const char *ellipses = "";                                          \
 733     IV len = RExC_precomp_end - RExC_precomp;                           \
 734                                                                         \
 735     PREPARE_TO_DIE;                                                     \
 736     if (len > RegexLengthToShowInErrorMessages) {                       \
 737         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 738         len = RegexLengthToShowInErrorMessages - 10;                    \
 739         ellipses = "...";                                               \
 740     }                                                                   \
 741     code;                                                               \
 742 } STMT_END
 743
 744 #define FAIL(msg) _FAIL(                            \
 745     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 746             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 747
 748 #define FAIL2(msg,arg) _FAIL(                       \
 749     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 750             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 751
 752 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 753     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 754      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 755
 756 /*
 757  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 758  */
 759 #define Simple_vFAIL(m) STMT_START {                                    \
 760     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 761             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 762 } STMT_END
 763
 764 /*
 765  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 766  */
 767 #define vFAIL(m) STMT_START {                           \
 768     PREPARE_TO_DIE;                                     \
 769     Simple_vFAIL(m);                                    \
 770 } STMT_END
 771
 772 /*
 773  * Like Simple_vFAIL(), but accepts two arguments.
 774  */
 775 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 776     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 777                       REPORT_LOCATION_ARGS(RExC_parse));        \
 778 } STMT_END
 779
 780 /*
 781  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 782  */
 783 #define vFAIL2(m,a1) STMT_START {                       \
 784     PREPARE_TO_DIE;                                     \
 785     Simple_vFAIL2(m, a1);                               \
 786 } STMT_END
 787
 788
 789 /*
 790  * Like Simple_vFAIL(), but accepts three arguments.
 791  */
 792 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 793     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 794             REPORT_LOCATION_ARGS(RExC_parse));                  \
 795 } STMT_END
 796
 797 /*
 798  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 799  */
 800 #define vFAIL3(m,a1,a2) STMT_START {                    \
 801     PREPARE_TO_DIE;                                     \
 802     Simple_vFAIL3(m, a1, a2);                           \
 803 } STMT_END
 804
 805 /*
 806  * Like Simple_vFAIL(), but accepts four arguments.
 807  */
 808 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 809     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 810             REPORT_LOCATION_ARGS(RExC_parse));                  \
 811 } STMT_END
 812
 813 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 814     PREPARE_TO_DIE;                                     \
 815     Simple_vFAIL4(m, a1, a2, a3);                       \
 816 } STMT_END
 817
 818 /* A specialized version of vFAIL2 that works with UTF8f */
 819 #define vFAIL2utf8f(m, a1) STMT_START {             \
 820     PREPARE_TO_DIE;                                 \
 821     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 822             REPORT_LOCATION_ARGS(RExC_parse));      \
 823 } STMT_END
 824
 825 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 826     PREPARE_TO_DIE;                                     \
 827     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 828             REPORT_LOCATION_ARGS(RExC_parse));          \
 829 } STMT_END
 830
 831 /* Setting this to NULL is a signal to not output warnings */
 832 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 833     STMT_START {                                                            \
 834       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 835       RExC_copy_start_in_constructed = NULL;                                \
 836     } STMT_END
 837 #define RESTORE_WARNINGS                                                    \
 838     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 839
 840 /* Since a warning can be generated multiple times as the input is reparsed, we
 841  * output it the first time we come to that point in the parse, but suppress it
 842  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 843  * generate any warnings */
 844 #define TO_OUTPUT_WARNINGS(loc)                                         \
 845   (   RExC_copy_start_in_constructed                                    \
 846    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 847
 848 /* After we've emitted a warning, we save the position in the input so we don't
 849  * output it again */
 850 #define UPDATE_WARNINGS_LOC(loc)                                        \
 851     STMT_START {                                                        \
 852         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 853             RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
 854         }                                                               \
 855     } STMT_END
 856
 857 /* 'warns' is the output of the packWARNx macro used in 'code' */
 858 #define _WARN_HELPER(loc, warns, code)                                  \
 859     STMT_START {                                                        \
 860         if (! RExC_copy_start_in_constructed) {                         \
 861             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 862                               " expected at '%s'",                      \
 863                               __FILE__, __LINE__, loc);                 \
 864         }                                                               \
 865         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 866             if (ckDEAD(warns))                                          \
 867                 PREPARE_TO_DIE;                                         \
 868             code;                                                       \
 869             UPDATE_WARNINGS_LOC(loc);                                   \
 870         }                                                               \
 871     } STMT_END
 872
 873 /* m is not necessarily a "literal string", in this macro */
 874 #define reg_warn_non_literal_string(loc, m)                             \
 875     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 876                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 877                                        "%s" REPORT_LOCATION,            \
 878                                   m, REPORT_LOCATION_ARGS(loc)))
 879
 880 #define ckWARNreg(loc,m)                                                \
 881     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 882                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 883                                           m REPORT_LOCATION,            \
 884                                           REPORT_LOCATION_ARGS(loc)))
 885
 886 #define vWARN(loc, m)                                                   \
 887     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 888                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 889                                        m REPORT_LOCATION,               \
 890                                        REPORT_LOCATION_ARGS(loc)))      \
 891
 892 #define vWARN_dep(loc, m)                                               \
 893     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 894                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 895                                        m REPORT_LOCATION,               \
 896                                        REPORT_LOCATION_ARGS(loc)))
 897
 898 #define ckWARNdep(loc,m)                                                \
 899     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 900                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 901                                             m REPORT_LOCATION,          \
 902                                             REPORT_LOCATION_ARGS(loc)))
 903
 904 #define ckWARNregdep(loc,m)                                                 \
 905     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 906                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 907                                                       WARN_REGEXP),         \
 908                                              m REPORT_LOCATION,             \
 909                                              REPORT_LOCATION_ARGS(loc)))
 910
 911 #define ckWARN2reg_d(loc,m, a1)                                             \
 912     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 913                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
 914                                             m REPORT_LOCATION,              \
 915                                             a1, REPORT_LOCATION_ARGS(loc)))
 916
 917 #define ckWARN2reg(loc, m, a1)                                              \
 918     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 919                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 920                                           m REPORT_LOCATION,                \
 921                                           a1, REPORT_LOCATION_ARGS(loc)))
 922
 923 #define vWARN3(loc, m, a1, a2)                                              \
 924     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 925                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
 926                                        m REPORT_LOCATION,                   \
 927                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
 928
 929 #define ckWARN3reg(loc, m, a1, a2)                                          \
 930     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 931                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 932                                           m REPORT_LOCATION,                \
 933                                           a1, a2,                           \
 934                                           REPORT_LOCATION_ARGS(loc)))
 935
 936 #define vWARN4(loc, m, a1, a2, a3)                                      \
 937     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 938                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 939                                        m REPORT_LOCATION,               \
 940                                        a1, a2, a3,                      \
 941                                        REPORT_LOCATION_ARGS(loc)))
 942
 943 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
 944     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 945                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 946                                           m REPORT_LOCATION,            \
 947                                           a1, a2, a3,                   \
 948                                           REPORT_LOCATION_ARGS(loc)))
 949
 950 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
 951     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 952                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 953                                        m REPORT_LOCATION,               \
 954                                        a1, a2, a3, a4,                  \
 955                                        REPORT_LOCATION_ARGS(loc)))
 956
 957 #define ckWARNexperimental(loc, class, m)                               \
 958     _WARN_HELPER(loc, packWARN(class),                                  \
 959                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
 960                                             m REPORT_LOCATION,          \
 961                                             REPORT_LOCATION_ARGS(loc)))
 962
 963 /* Convert between a pointer to a node and its offset from the beginning of the
 964  * program */
 965 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
 966 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
 967
 968 /* Macros for recording node offsets.   20001227 mjd@plover.com
 969  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 970  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 971  * Element 0 holds the number n.
 972  * Position is 1 indexed.
 973  */
 974 #ifndef RE_TRACK_PATTERN_OFFSETS
 975 #define Set_Node_Offset_To_R(offset,byte)
 976 #define Set_Node_Offset(node,byte)
 977 #define Set_Cur_Node_Offset
 978 #define Set_Node_Length_To_R(node,len)
 979 #define Set_Node_Length(node,len)
 980 #define Set_Node_Cur_Length(node,start)
 981 #define Node_Offset(n)
 982 #define Node_Length(n)
 983 #define Set_Node_Offset_Length(node,offset,len)
 984 #define ProgLen(ri) ri->u.proglen
 985 #define SetProgLen(ri,x) ri->u.proglen = x
 986 #define Track_Code(code)
 987 #else
 988 #define ProgLen(ri) ri->u.offsets[0]
 989 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 990 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
 991         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 992                     __LINE__, (int)(offset), (int)(byte)));             \
 993         if((offset) < 0) {                                              \
 994             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 995                                          (int)(offset));                \
 996         } else {                                                        \
 997             RExC_offsets[2*(offset)-1] = (byte);                        \
 998         }                                                               \
 999 } STMT_END
1000
1001 #define Set_Node_Offset(node,byte)                                      \
1002     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1003 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1004
1005 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1006         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1007                 __LINE__, (int)(node), (int)(len)));                    \
1008         if((node) < 0) {                                                \
1009             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1010                                          (int)(node));                  \
1011         } else {                                                        \
1012             RExC_offsets[2*(node)] = (len);                             \
1013         }                                                               \
1014 } STMT_END
1015
1016 #define Set_Node_Length(node,len) \
1017     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1018 #define Set_Node_Cur_Length(node, start)                \
1019     Set_Node_Length(node, RExC_parse - start)
1020
1021 /* Get offsets and lengths */
1022 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1023 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1024
1025 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1026     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1027     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1028 } STMT_END
1029
1030 #define Track_Code(code) STMT_START { code } STMT_END
1031 #endif
1032
1033 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1034 #define EXPERIMENTAL_INPLACESCAN
1035 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1036
1037 #ifdef DEBUGGING
1038 int
1039 Perl_re_printf(pTHX_ const char *fmt, ...)
1040 {
1041     va_list ap;
1042     int result;
1043     PerlIO *f= Perl_debug_log;
1044     PERL_ARGS_ASSERT_RE_PRINTF;
1045     va_start(ap, fmt);
1046     result = PerlIO_vprintf(f, fmt, ap);
1047     va_end(ap);
1048     return result;
1049 }
1050
1051 int
1052 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1053 {
1054     va_list ap;
1055     int result;
1056     PerlIO *f= Perl_debug_log;
1057     PERL_ARGS_ASSERT_RE_INDENTF;
1058     va_start(ap, depth);
1059     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1060     result = PerlIO_vprintf(f, fmt, ap);
1061     va_end(ap);
1062     return result;
1063 }
1064 #endif /* DEBUGGING */
1065
1066 #define DEBUG_RExC_seen()                                                   \
1067         DEBUG_OPTIMISE_MORE_r({                                             \
1068             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1069                                                                             \
1070             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1071                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1072                                                                             \
1073             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1074                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1075                                                                             \
1076             if (RExC_seen & REG_GPOS_SEEN)                                  \
1077                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1078                                                                             \
1079             if (RExC_seen & REG_RECURSE_SEEN)                               \
1080                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1081                                                                             \
1082             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1083                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1084                                                                             \
1085             if (RExC_seen & REG_VERBARG_SEEN)                               \
1086                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1087                                                                             \
1088             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1089                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1090                                                                             \
1091             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1092                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1093                                                                             \
1094             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1095                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1096                                                                             \
1097             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1098                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1099                                                                             \
1100             Perl_re_printf( aTHX_ "\n");                                    \
1101         });
1102
1103 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1104   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1105
1106
1107 #ifdef DEBUGGING
1108 static void
1109 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1110                                     const char *close_str)
1111 {
1112     if (!flags)
1113         return;
1114
1115     Perl_re_printf( aTHX_  "%s", open_str);
1116     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1117     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1118     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1119     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1120     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1121     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1122     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1123     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1124     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1125     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1126     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1127     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1128     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1129     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1130     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1131     Perl_re_printf( aTHX_  "%s", close_str);
1132 }
1133
1134
1135 static void
1136 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1137                     U32 depth, int is_inf)
1138 {
1139     GET_RE_DEBUG_FLAGS_DECL;
1140
1141     DEBUG_OPTIMISE_MORE_r({
1142         if (!data)
1143             return;
1144         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1145             depth,
1146             where,
1147             (IV)data->pos_min,
1148             (IV)data->pos_delta,
1149             (UV)data->flags
1150         );
1151
1152         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1153
1154         Perl_re_printf( aTHX_
1155             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1156             (IV)data->whilem_c,
1157             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1158             is_inf ? "INF " : ""
1159         );
1160
1161         if (data->last_found) {
1162             int i;
1163             Perl_re_printf(aTHX_
1164                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1165                     SvPVX_const(data->last_found),
1166                     (IV)data->last_end,
1167                     (IV)data->last_start_min,
1168                     (IV)data->last_start_max
1169             );
1170
1171             for (i = 0; i < 2; i++) {
1172                 Perl_re_printf(aTHX_
1173                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1174                     data->cur_is_floating == i ? "*" : "",
1175                     i ? "Float" : "Fixed",
1176                     SvPVX_const(data->substrs[i].str),
1177                     (IV)data->substrs[i].min_offset,
1178                     (IV)data->substrs[i].max_offset
1179                 );
1180                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1181             }
1182         }
1183
1184         Perl_re_printf( aTHX_ "\n");
1185     });
1186 }
1187
1188
1189 static void
1190 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1191                 regnode *scan, U32 depth, U32 flags)
1192 {
1193     GET_RE_DEBUG_FLAGS_DECL;
1194
1195     DEBUG_OPTIMISE_r({
1196         regnode *Next;
1197
1198         if (!scan)
1199             return;
1200         Next = regnext(scan);
1201         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1202         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1203             depth,
1204             str,
1205             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1206             Next ? (REG_NODE_NUM(Next)) : 0 );
1207         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1208         Perl_re_printf( aTHX_  "\n");
1209    });
1210 }
1211
1212
1213 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1214                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1215
1216 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1217                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1218
1219 #else
1220 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1221 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1222 #endif
1223
1224
1225 /* =========================================================
1226  * BEGIN edit_distance stuff.
1227  *
1228  * This calculates how many single character changes of any type are needed to
1229  * transform a string into another one.  It is taken from version 3.1 of
1230  *
1231  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1232  */
1233
1234 /* Our unsorted dictionary linked list.   */
1235 /* Note we use UVs, not chars. */
1236
1237 struct dictionary{
1238   UV key;
1239   UV value;
1240   struct dictionary* next;
1241 };
1242 typedef struct dictionary item;
1243
1244
1245 PERL_STATIC_INLINE item*
1246 push(UV key, item* curr)
1247 {
1248     item* head;
1249     Newx(head, 1, item);
1250     head->key = key;
1251     head->value = 0;
1252     head->next = curr;
1253     return head;
1254 }
1255
1256
1257 PERL_STATIC_INLINE item*
1258 find(item* head, UV key)
1259 {
1260     item* iterator = head;
1261     while (iterator){
1262         if (iterator->key == key){
1263             return iterator;
1264         }
1265         iterator = iterator->next;
1266     }
1267
1268     return NULL;
1269 }
1270
1271 PERL_STATIC_INLINE item*
1272 uniquePush(item* head, UV key)
1273 {
1274     item* iterator = head;
1275
1276     while (iterator){
1277         if (iterator->key == key) {
1278             return head;
1279         }
1280         iterator = iterator->next;
1281     }
1282
1283     return push(key, head);
1284 }
1285
1286 PERL_STATIC_INLINE void
1287 dict_free(item* head)
1288 {
1289     item* iterator = head;
1290
1291     while (iterator) {
1292         item* temp = iterator;
1293         iterator = iterator->next;
1294         Safefree(temp);
1295     }
1296
1297     head = NULL;
1298 }
1299
1300 /* End of Dictionary Stuff */
1301
1302 /* All calculations/work are done here */
1303 STATIC int
1304 S_edit_distance(const UV* src,
1305                 const UV* tgt,
1306                 const STRLEN x,             /* length of src[] */
1307                 const STRLEN y,             /* length of tgt[] */
1308                 const SSize_t maxDistance
1309 )
1310 {
1311     item *head = NULL;
1312     UV swapCount, swapScore, targetCharCount, i, j;
1313     UV *scores;
1314     UV score_ceil = x + y;
1315
1316     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1317
1318     /* intialize matrix start values */
1319     Newx(scores, ( (x + 2) * (y + 2)), UV);
1320     scores[0] = score_ceil;
1321     scores[1 * (y + 2) + 0] = score_ceil;
1322     scores[0 * (y + 2) + 1] = score_ceil;
1323     scores[1 * (y + 2) + 1] = 0;
1324     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1325
1326     /* work loops    */
1327     /* i = src index */
1328     /* j = tgt index */
1329     for (i=1;i<=x;i++) {
1330         if (i < x)
1331             head = uniquePush(head, src[i]);
1332         scores[(i+1) * (y + 2) + 1] = i;
1333         scores[(i+1) * (y + 2) + 0] = score_ceil;
1334         swapCount = 0;
1335
1336         for (j=1;j<=y;j++) {
1337             if (i == 1) {
1338                 if(j < y)
1339                 head = uniquePush(head, tgt[j]);
1340                 scores[1 * (y + 2) + (j + 1)] = j;
1341                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1342             }
1343
1344             targetCharCount = find(head, tgt[j-1])->value;
1345             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1346
1347             if (src[i-1] != tgt[j-1]){
1348                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1349             }
1350             else {
1351                 swapCount = j;
1352                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1353             }
1354         }
1355
1356         find(head, src[i-1])->value = i;
1357     }
1358
1359     {
1360         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1361         dict_free(head);
1362         Safefree(scores);
1363         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1364     }
1365 }
1366
1367 /* END of edit_distance() stuff
1368  * ========================================================= */
1369
1370 /* is c a control character for which we have a mnemonic? */
1371 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1372
1373 STATIC const char *
1374 S_cntrl_to_mnemonic(const U8 c)
1375 {
1376     /* Returns the mnemonic string that represents character 'c', if one
1377      * exists; NULL otherwise.  The only ones that exist for the purposes of
1378      * this routine are a few control characters */
1379
1380     switch (c) {
1381         case '\a':       return "\\a";
1382         case '\b':       return "\\b";
1383         case ESC_NATIVE: return "\\e";
1384         case '\f':       return "\\f";
1385         case '\n':       return "\\n";
1386         case '\r':       return "\\r";
1387         case '\t':       return "\\t";
1388     }
1389
1390     return NULL;
1391 }
1392
1393 /* Mark that we cannot extend a found fixed substring at this point.
1394    Update the longest found anchored substring or the longest found
1395    floating substrings if needed. */
1396
1397 STATIC void
1398 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1399                     SSize_t *minlenp, int is_inf)
1400 {
1401     const STRLEN l = CHR_SVLEN(data->last_found);
1402     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1403     const STRLEN old_l = CHR_SVLEN(longest_sv);
1404     GET_RE_DEBUG_FLAGS_DECL;
1405
1406     PERL_ARGS_ASSERT_SCAN_COMMIT;
1407
1408     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1409         const U8 i = data->cur_is_floating;
1410         SvSetMagicSV(longest_sv, data->last_found);
1411         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1412
1413         if (!i) /* fixed */
1414             data->substrs[0].max_offset = data->substrs[0].min_offset;
1415         else { /* float */
1416             data->substrs[1].max_offset = (l
1417                           ? data->last_start_max
1418                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1419                                          ? SSize_t_MAX
1420                                          : data->pos_min + data->pos_delta));
1421             if (is_inf
1422                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1423                 data->substrs[1].max_offset = SSize_t_MAX;
1424         }
1425
1426         if (data->flags & SF_BEFORE_EOL)
1427             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1428         else
1429             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1430         data->substrs[i].minlenp = minlenp;
1431         data->substrs[i].lookbehind = 0;
1432     }
1433
1434     SvCUR_set(data->last_found, 0);
1435     {
1436         SV * const sv = data->last_found;
1437         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1438             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1439             if (mg)
1440                 mg->mg_len = 0;
1441         }
1442     }
1443     data->last_end = -1;
1444     data->flags &= ~SF_BEFORE_EOL;
1445     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1446 }
1447
1448 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1449  * list that describes which code points it matches */
1450
1451 STATIC void
1452 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1453 {
1454     /* Set the SSC 'ssc' to match an empty string or any code point */
1455
1456     PERL_ARGS_ASSERT_SSC_ANYTHING;
1457
1458     assert(is_ANYOF_SYNTHETIC(ssc));
1459
1460     /* mortalize so won't leak */
1461     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1462     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1463 }
1464
1465 STATIC int
1466 S_ssc_is_anything(const regnode_ssc *ssc)
1467 {
1468     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1469      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1470      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1471      * in any way, so there's no point in using it */
1472
1473     UV start, end;
1474     bool ret;
1475
1476     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1477
1478     assert(is_ANYOF_SYNTHETIC(ssc));
1479
1480     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1481         return FALSE;
1482     }
1483
1484     /* See if the list consists solely of the range 0 - Infinity */
1485     invlist_iterinit(ssc->invlist);
1486     ret = invlist_iternext(ssc->invlist, &start, &end)
1487           && start == 0
1488           && end == UV_MAX;
1489
1490     invlist_iterfinish(ssc->invlist);
1491
1492     if (ret) {
1493         return TRUE;
1494     }
1495
1496     /* If e.g., both \w and \W are set, matches everything */
1497     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1498         int i;
1499         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1500             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1501                 return TRUE;
1502             }
1503         }
1504     }
1505
1506     return FALSE;
1507 }
1508
1509 STATIC void
1510 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1511 {
1512     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1513      * string, any code point, or any posix class under locale */
1514
1515     PERL_ARGS_ASSERT_SSC_INIT;
1516
1517     Zero(ssc, 1, regnode_ssc);
1518     set_ANYOF_SYNTHETIC(ssc);
1519     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1520     ssc_anything(ssc);
1521
1522     /* If any portion of the regex is to operate under locale rules that aren't
1523      * fully known at compile time, initialization includes it.  The reason
1524      * this isn't done for all regexes is that the optimizer was written under
1525      * the assumption that locale was all-or-nothing.  Given the complexity and
1526      * lack of documentation in the optimizer, and that there are inadequate
1527      * test cases for locale, many parts of it may not work properly, it is
1528      * safest to avoid locale unless necessary. */
1529     if (RExC_contains_locale) {
1530         ANYOF_POSIXL_SETALL(ssc);
1531     }
1532     else {
1533         ANYOF_POSIXL_ZERO(ssc);
1534     }
1535 }
1536
1537 STATIC int
1538 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1539                         const regnode_ssc *ssc)
1540 {
1541     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1542      * to the list of code points matched, and locale posix classes; hence does
1543      * not check its flags) */
1544
1545     UV start, end;
1546     bool ret;
1547
1548     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1549
1550     assert(is_ANYOF_SYNTHETIC(ssc));
1551
1552     invlist_iterinit(ssc->invlist);
1553     ret = invlist_iternext(ssc->invlist, &start, &end)
1554           && start == 0
1555           && end == UV_MAX;
1556
1557     invlist_iterfinish(ssc->invlist);
1558
1559     if (! ret) {
1560         return FALSE;
1561     }
1562
1563     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1564         return FALSE;
1565     }
1566
1567     return TRUE;
1568 }
1569
1570 #define INVLIST_INDEX 0
1571 #define ONLY_LOCALE_MATCHES_INDEX 1
1572 #define DEFERRED_USER_DEFINED_INDEX 2
1573
1574 STATIC SV*
1575 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1576                                const regnode_charclass* const node)
1577 {
1578     /* Returns a mortal inversion list defining which code points are matched
1579      * by 'node', which is of type ANYOF.  Handles complementing the result if
1580      * appropriate.  If some code points aren't knowable at this time, the
1581      * returned list must, and will, contain every code point that is a
1582      * possibility. */
1583
1584     dVAR;
1585     SV* invlist = NULL;
1586     SV* only_utf8_locale_invlist = NULL;
1587     unsigned int i;
1588     const U32 n = ARG(node);
1589     bool new_node_has_latin1 = FALSE;
1590     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
1591                       ? 0
1592                       : ANYOF_FLAGS(node);
1593
1594     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1595
1596     /* Look at the data structure created by S_set_ANYOF_arg() */
1597     if (n != ANYOF_ONLY_HAS_BITMAP) {
1598         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1599         AV * const av = MUTABLE_AV(SvRV(rv));
1600         SV **const ary = AvARRAY(av);
1601         assert(RExC_rxi->data->what[n] == 's');
1602
1603         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1604
1605             /* Here there are things that won't be known until runtime -- we
1606              * have to assume it could be anything */
1607             invlist = sv_2mortal(_new_invlist(1));
1608             return _add_range_to_invlist(invlist, 0, UV_MAX);
1609         }
1610         else if (ary[INVLIST_INDEX]) {
1611
1612             /* Use the node's inversion list */
1613             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1614         }
1615
1616         /* Get the code points valid only under UTF-8 locales */
1617         if (   (flags & ANYOFL_FOLD)
1618             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1619         {
1620             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1621         }
1622     }
1623
1624     if (! invlist) {
1625         invlist = sv_2mortal(_new_invlist(0));
1626     }
1627
1628     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1629      * code points, and an inversion list for the others, but if there are code
1630      * points that should match only conditionally on the target string being
1631      * UTF-8, those are placed in the inversion list, and not the bitmap.
1632      * Since there are circumstances under which they could match, they are
1633      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1634      * to exclude them here, so that when we invert below, the end result
1635      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1636      * have to do this here before we add the unconditionally matched code
1637      * points */
1638     if (flags & ANYOF_INVERT) {
1639         _invlist_intersection_complement_2nd(invlist,
1640                                              PL_UpperLatin1,
1641                                              &invlist);
1642     }
1643
1644     /* Add in the points from the bit map */
1645     if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
1646         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1647             if (ANYOF_BITMAP_TEST(node, i)) {
1648                 unsigned int start = i++;
1649
1650                 for (;    i < NUM_ANYOF_CODE_POINTS
1651                        && ANYOF_BITMAP_TEST(node, i); ++i)
1652                 {
1653                     /* empty */
1654                 }
1655                 invlist = _add_range_to_invlist(invlist, start, i-1);
1656                 new_node_has_latin1 = TRUE;
1657             }
1658         }
1659     }
1660
1661     /* If this can match all upper Latin1 code points, have to add them
1662      * as well.  But don't add them if inverting, as when that gets done below,
1663      * it would exclude all these characters, including the ones it shouldn't
1664      * that were added just above */
1665     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1666         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1667     {
1668         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1669     }
1670
1671     /* Similarly for these */
1672     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1673         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1674     }
1675
1676     if (flags & ANYOF_INVERT) {
1677         _invlist_invert(invlist);
1678     }
1679     else if (flags & ANYOFL_FOLD) {
1680         if (new_node_has_latin1) {
1681
1682             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1683              * the locale.  We can skip this if there are no 0-255 at all. */
1684             _invlist_union(invlist, PL_Latin1, &invlist);
1685
1686             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1687             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1688         }
1689         else {
1690             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1691                 invlist = add_cp_to_invlist(invlist, 'I');
1692             }
1693             if (_invlist_contains_cp(invlist,
1694                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1695             {
1696                 invlist = add_cp_to_invlist(invlist, 'i');
1697             }
1698         }
1699     }
1700
1701     /* Similarly add the UTF-8 locale possible matches.  These have to be
1702      * deferred until after the non-UTF-8 locale ones are taken care of just
1703      * above, or it leads to wrong results under ANYOF_INVERT */
1704     if (only_utf8_locale_invlist) {
1705         _invlist_union_maybe_complement_2nd(invlist,
1706                                             only_utf8_locale_invlist,
1707                                             flags & ANYOF_INVERT,
1708                                             &invlist);
1709     }
1710
1711     return invlist;
1712 }
1713
1714 /* These two functions currently do the exact same thing */
1715 #define ssc_init_zero           ssc_init
1716
1717 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1718 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1719
1720 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1721  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1722  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1723
1724 STATIC void
1725 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1726                 const regnode_charclass *and_with)
1727 {
1728     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1729      * another SSC or a regular ANYOF class.  Can create false positives. */
1730
1731     SV* anded_cp_list;
1732     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
1733                           ? 0
1734                           : ANYOF_FLAGS(and_with);
1735     U8  anded_flags;
1736
1737     PERL_ARGS_ASSERT_SSC_AND;
1738
1739     assert(is_ANYOF_SYNTHETIC(ssc));
1740
1741     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1742      * the code point inversion list and just the relevant flags */
1743     if (is_ANYOF_SYNTHETIC(and_with)) {
1744         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1745         anded_flags = and_with_flags;
1746
1747         /* XXX This is a kludge around what appears to be deficiencies in the
1748          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1749          * there are paths through the optimizer where it doesn't get weeded
1750          * out when it should.  And if we don't make some extra provision for
1751          * it like the code just below, it doesn't get added when it should.
1752          * This solution is to add it only when AND'ing, which is here, and
1753          * only when what is being AND'ed is the pristine, original node
1754          * matching anything.  Thus it is like adding it to ssc_anything() but
1755          * only when the result is to be AND'ed.  Probably the same solution
1756          * could be adopted for the same problem we have with /l matching,
1757          * which is solved differently in S_ssc_init(), and that would lead to
1758          * fewer false positives than that solution has.  But if this solution
1759          * creates bugs, the consequences are only that a warning isn't raised
1760          * that should be; while the consequences for having /l bugs is
1761          * incorrect matches */
1762         if (ssc_is_anything((regnode_ssc *)and_with)) {
1763             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1764         }
1765     }
1766     else {
1767         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1768         if (OP(and_with) == ANYOFD) {
1769             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1770         }
1771         else {
1772             anded_flags = and_with_flags
1773             &( ANYOF_COMMON_FLAGS
1774               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1775               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1776             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1777                 anded_flags &=
1778                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1779             }
1780         }
1781     }
1782
1783     ANYOF_FLAGS(ssc) &= anded_flags;
1784
1785     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1786      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1787      * 'and_with' may be inverted.  When not inverted, we have the situation of
1788      * computing:
1789      *  (C1 | P1) & (C2 | P2)
1790      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1791      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1792      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1793      *                    <=  ((C1 & C2) | P1 | P2)
1794      * Alternatively, the last few steps could be:
1795      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1796      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1797      *                    <=  (C1 | C2 | (P1 & P2))
1798      * We favor the second approach if either P1 or P2 is non-empty.  This is
1799      * because these components are a barrier to doing optimizations, as what
1800      * they match cannot be known until the moment of matching as they are
1801      * dependent on the current locale, 'AND"ing them likely will reduce or
1802      * eliminate them.
1803      * But we can do better if we know that C1,P1 are in their initial state (a
1804      * frequent occurrence), each matching everything:
1805      *  (<everything>) & (C2 | P2) =  C2 | P2
1806      * Similarly, if C2,P2 are in their initial state (again a frequent
1807      * occurrence), the result is a no-op
1808      *  (C1 | P1) & (<everything>) =  C1 | P1
1809      *
1810      * Inverted, we have
1811      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1812      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1813      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1814      * */
1815
1816     if ((and_with_flags & ANYOF_INVERT)
1817         && ! is_ANYOF_SYNTHETIC(and_with))
1818     {
1819         unsigned int i;
1820
1821         ssc_intersection(ssc,
1822                          anded_cp_list,
1823                          FALSE /* Has already been inverted */
1824                          );
1825
1826         /* If either P1 or P2 is empty, the intersection will be also; can skip
1827          * the loop */
1828         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1829             ANYOF_POSIXL_ZERO(ssc);
1830         }
1831         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1832
1833             /* Note that the Posix class component P from 'and_with' actually
1834              * looks like:
1835              *      P = Pa | Pb | ... | Pn
1836              * where each component is one posix class, such as in [\w\s].
1837              * Thus
1838              *      ~P = ~(Pa | Pb | ... | Pn)
1839              *         = ~Pa & ~Pb & ... & ~Pn
1840              *        <= ~Pa | ~Pb | ... | ~Pn
1841              * The last is something we can easily calculate, but unfortunately
1842              * is likely to have many false positives.  We could do better
1843              * in some (but certainly not all) instances if two classes in
1844              * P have known relationships.  For example
1845              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1846              * So
1847              *      :lower: & :print: = :lower:
1848              * And similarly for classes that must be disjoint.  For example,
1849              * since \s and \w can have no elements in common based on rules in
1850              * the POSIX standard,
1851              *      \w & ^\S = nothing
1852              * Unfortunately, some vendor locales do not meet the Posix
1853              * standard, in particular almost everything by Microsoft.
1854              * The loop below just changes e.g., \w into \W and vice versa */
1855
1856             regnode_charclass_posixl temp;
1857             int add = 1;    /* To calculate the index of the complement */
1858
1859             Zero(&temp, 1, regnode_charclass_posixl);
1860             ANYOF_POSIXL_ZERO(&temp);
1861             for (i = 0; i < ANYOF_MAX; i++) {
1862                 assert(i % 2 != 0
1863                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1864                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1865
1866                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1867                     ANYOF_POSIXL_SET(&temp, i + add);
1868                 }
1869                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1870             }
1871             ANYOF_POSIXL_AND(&temp, ssc);
1872
1873         } /* else ssc already has no posixes */
1874     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1875          in its initial state */
1876     else if (! is_ANYOF_SYNTHETIC(and_with)
1877              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1878     {
1879         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1880          * copy it over 'ssc' */
1881         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1882             if (is_ANYOF_SYNTHETIC(and_with)) {
1883                 StructCopy(and_with, ssc, regnode_ssc);
1884             }
1885             else {
1886                 ssc->invlist = anded_cp_list;
1887                 ANYOF_POSIXL_ZERO(ssc);
1888                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1889                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1890                 }
1891             }
1892         }
1893         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1894                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1895         {
1896             /* One or the other of P1, P2 is non-empty. */
1897             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1898                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1899             }
1900             ssc_union(ssc, anded_cp_list, FALSE);
1901         }
1902         else { /* P1 = P2 = empty */
1903             ssc_intersection(ssc, anded_cp_list, FALSE);
1904         }
1905     }
1906 }
1907
1908 STATIC void
1909 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1910                const regnode_charclass *or_with)
1911 {
1912     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1913      * another SSC or a regular ANYOF class.  Can create false positives if
1914      * 'or_with' is to be inverted. */
1915
1916     SV* ored_cp_list;
1917     U8 ored_flags;
1918     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
1919                          ? 0
1920                          : ANYOF_FLAGS(or_with);
1921
1922     PERL_ARGS_ASSERT_SSC_OR;
1923
1924     assert(is_ANYOF_SYNTHETIC(ssc));
1925
1926     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1927      * the code point inversion list and just the relevant flags */
1928     if (is_ANYOF_SYNTHETIC(or_with)) {
1929         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1930         ored_flags = or_with_flags;
1931     }
1932     else {
1933         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1934         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1935         if (OP(or_with) != ANYOFD) {
1936             ored_flags
1937             |= or_with_flags
1938              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1939                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1940             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1941                 ored_flags |=
1942                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1943             }
1944         }
1945     }
1946
1947     ANYOF_FLAGS(ssc) |= ored_flags;
1948
1949     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1950      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1951      * 'or_with' may be inverted.  When not inverted, we have the simple
1952      * situation of computing:
1953      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1954      * If P1|P2 yields a situation with both a class and its complement are
1955      * set, like having both \w and \W, this matches all code points, and we
1956      * can delete these from the P component of the ssc going forward.  XXX We
1957      * might be able to delete all the P components, but I (khw) am not certain
1958      * about this, and it is better to be safe.
1959      *
1960      * Inverted, we have
1961      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1962      *                         <=  (C1 | P1) | ~C2
1963      *                         <=  (C1 | ~C2) | P1
1964      * (which results in actually simpler code than the non-inverted case)
1965      * */
1966
1967     if ((or_with_flags & ANYOF_INVERT)
1968         && ! is_ANYOF_SYNTHETIC(or_with))
1969     {
1970         /* We ignore P2, leaving P1 going forward */
1971     }   /* else  Not inverted */
1972     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1973         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1974         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1975             unsigned int i;
1976             for (i = 0; i < ANYOF_MAX; i += 2) {
1977                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1978                 {
1979                     ssc_match_all_cp(ssc);
1980                     ANYOF_POSIXL_CLEAR(ssc, i);
1981                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1982                 }
1983             }
1984         }
1985     }
1986
1987     ssc_union(ssc,
1988               ored_cp_list,
1989               FALSE /* Already has been inverted */
1990               );
1991 }
1992
1993 PERL_STATIC_INLINE void
1994 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1995 {
1996     PERL_ARGS_ASSERT_SSC_UNION;
1997
1998     assert(is_ANYOF_SYNTHETIC(ssc));
1999
2000     _invlist_union_maybe_complement_2nd(ssc->invlist,
2001                                         invlist,
2002                                         invert2nd,
2003                                         &ssc->invlist);
2004 }
2005
2006 PERL_STATIC_INLINE void
2007 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2008                          SV* const invlist,
2009                          const bool invert2nd)
2010 {
2011     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2012
2013     assert(is_ANYOF_SYNTHETIC(ssc));
2014
2015     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2016                                                invlist,
2017                                                invert2nd,
2018                                                &ssc->invlist);
2019 }
2020
2021 PERL_STATIC_INLINE void
2022 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2023 {
2024     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2025
2026     assert(is_ANYOF_SYNTHETIC(ssc));
2027
2028     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2029 }
2030
2031 PERL_STATIC_INLINE void
2032 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2033 {
2034     /* AND just the single code point 'cp' into the SSC 'ssc' */
2035
2036     SV* cp_list = _new_invlist(2);
2037
2038     PERL_ARGS_ASSERT_SSC_CP_AND;
2039
2040     assert(is_ANYOF_SYNTHETIC(ssc));
2041
2042     cp_list = add_cp_to_invlist(cp_list, cp);
2043     ssc_intersection(ssc, cp_list,
2044                      FALSE /* Not inverted */
2045                      );
2046     SvREFCNT_dec_NN(cp_list);
2047 }
2048
2049 PERL_STATIC_INLINE void
2050 S_ssc_clear_locale(regnode_ssc *ssc)
2051 {
2052     /* Set the SSC 'ssc' to not match any locale things */
2053     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2054
2055     assert(is_ANYOF_SYNTHETIC(ssc));
2056
2057     ANYOF_POSIXL_ZERO(ssc);
2058     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2059 }
2060
2061 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2062
2063 STATIC bool
2064 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2065 {
2066     /* The synthetic start class is used to hopefully quickly winnow down
2067      * places where a pattern could start a match in the target string.  If it
2068      * doesn't really narrow things down that much, there isn't much point to
2069      * having the overhead of using it.  This function uses some very crude
2070      * heuristics to decide if to use the ssc or not.
2071      *
2072      * It returns TRUE if 'ssc' rules out more than half what it considers to
2073      * be the "likely" possible matches, but of course it doesn't know what the
2074      * actual things being matched are going to be; these are only guesses
2075      *
2076      * For /l matches, it assumes that the only likely matches are going to be
2077      *      in the 0-255 range, uniformly distributed, so half of that is 127
2078      * For /a and /d matches, it assumes that the likely matches will be just
2079      *      the ASCII range, so half of that is 63
2080      * For /u and there isn't anything matching above the Latin1 range, it
2081      *      assumes that that is the only range likely to be matched, and uses
2082      *      half that as the cut-off: 127.  If anything matches above Latin1,
2083      *      it assumes that all of Unicode could match (uniformly), except for
2084      *      non-Unicode code points and things in the General Category "Other"
2085      *      (unassigned, private use, surrogates, controls and formats).  This
2086      *      is a much large number. */
2087
2088     U32 count = 0;      /* Running total of number of code points matched by
2089                            'ssc' */
2090     UV start, end;      /* Start and end points of current range in inversion
2091                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2092     const U32 max_code_points = (LOC)
2093                                 ?  256
2094                                 : ((  ! UNI_SEMANTICS
2095                                     ||  invlist_highest(ssc->invlist) < 256)
2096                                   ? 128
2097                                   : NON_OTHER_COUNT);
2098     const U32 max_match = max_code_points / 2;
2099
2100     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2101
2102     invlist_iterinit(ssc->invlist);
2103     while (invlist_iternext(ssc->invlist, &start, &end)) {
2104         if (start >= max_code_points) {
2105             break;
2106         }
2107         end = MIN(end, max_code_points - 1);
2108         count += end - start + 1;
2109         if (count >= max_match) {
2110             invlist_iterfinish(ssc->invlist);
2111             return FALSE;
2112         }
2113     }
2114
2115     return TRUE;
2116 }
2117
2118
2119 STATIC void
2120 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2121 {
2122     /* The inversion list in the SSC is marked mortal; now we need a more
2123      * permanent copy, which is stored the same way that is done in a regular
2124      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2125      * map */
2126
2127     SV* invlist = invlist_clone(ssc->invlist, NULL);
2128
2129     PERL_ARGS_ASSERT_SSC_FINALIZE;
2130
2131     assert(is_ANYOF_SYNTHETIC(ssc));
2132
2133     /* The code in this file assumes that all but these flags aren't relevant
2134      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2135      * by the time we reach here */
2136     assert(! (ANYOF_FLAGS(ssc)
2137         & ~( ANYOF_COMMON_FLAGS
2138             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2139             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2140
2141     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2142
2143     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2144
2145     /* Make sure is clone-safe */
2146     ssc->invlist = NULL;
2147
2148     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2149         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2150         OP(ssc) = ANYOFPOSIXL;
2151     }
2152     else if (RExC_contains_locale) {
2153         OP(ssc) = ANYOFL;
2154     }
2155
2156     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2157 }
2158
2159 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2160 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2161 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2162 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2163                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2164                                : 0 )
2165
2166
2167 #ifdef DEBUGGING
2168 /*
2169    dump_trie(trie,widecharmap,revcharmap)
2170    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2171    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2172
2173    These routines dump out a trie in a somewhat readable format.
2174    The _interim_ variants are used for debugging the interim
2175    tables that are used to generate the final compressed
2176    representation which is what dump_trie expects.
2177
2178    Part of the reason for their existence is to provide a form
2179    of documentation as to how the different representations function.
2180
2181 */
2182
2183 /*
2184   Dumps the final compressed table form of the trie to Perl_debug_log.
2185   Used for debugging make_trie().
2186 */
2187
2188 STATIC void
2189 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2190             AV *revcharmap, U32 depth)
2191 {
2192     U32 state;
2193     SV *sv=sv_newmortal();
2194     int colwidth= widecharmap ? 6 : 4;
2195     U16 word;
2196     GET_RE_DEBUG_FLAGS_DECL;
2197
2198     PERL_ARGS_ASSERT_DUMP_TRIE;
2199
2200     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2201         depth+1, "Match","Base","Ofs" );
2202
2203     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2204         SV ** const tmp = av_fetch( revcharmap, state, 0);
2205         if ( tmp ) {
2206             Perl_re_printf( aTHX_  "%*s",
2207                 colwidth,
2208                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2209                             PL_colors[0], PL_colors[1],
2210                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2211                             PERL_PV_ESCAPE_FIRSTCHAR
2212                 )
2213             );
2214         }
2215     }
2216     Perl_re_printf( aTHX_  "\n");
2217     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2218
2219     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2220         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2221     Perl_re_printf( aTHX_  "\n");
2222
2223     for( state = 1 ; state < trie->statecount ; state++ ) {
2224         const U32 base = trie->states[ state ].trans.base;
2225
2226         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2227
2228         if ( trie->states[ state ].wordnum ) {
2229             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2230         } else {
2231             Perl_re_printf( aTHX_  "%6s", "" );
2232         }
2233
2234         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2235
2236         if ( base ) {
2237             U32 ofs = 0;
2238
2239             while( ( base + ofs  < trie->uniquecharcount ) ||
2240                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2241                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2242                                                                     != state))
2243                     ofs++;
2244
2245             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2246
2247             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2248                 if ( ( base + ofs >= trie->uniquecharcount )
2249                         && ( base + ofs - trie->uniquecharcount
2250                                                         < trie->lasttrans )
2251                         && trie->trans[ base + ofs
2252                                     - trie->uniquecharcount ].check == state )
2253                 {
2254                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2255                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2256                    );
2257                 } else {
2258                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2259                 }
2260             }
2261
2262             Perl_re_printf( aTHX_  "]");
2263
2264         }
2265         Perl_re_printf( aTHX_  "\n" );
2266     }
2267     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2268                                 depth);
2269     for (word=1; word <= trie->wordcount; word++) {
2270         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2271             (int)word, (int)(trie->wordinfo[word].prev),
2272             (int)(trie->wordinfo[word].len));
2273     }
2274     Perl_re_printf( aTHX_  "\n" );
2275 }
2276 /*
2277   Dumps a fully constructed but uncompressed trie in list form.
2278   List tries normally only are used for construction when the number of
2279   possible chars (trie->uniquecharcount) is very high.
2280   Used for debugging make_trie().
2281 */
2282 STATIC void
2283 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2284                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2285                          U32 depth)
2286 {
2287     U32 state;
2288     SV *sv=sv_newmortal();
2289     int colwidth= widecharmap ? 6 : 4;
2290     GET_RE_DEBUG_FLAGS_DECL;
2291
2292     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2293
2294     /* print out the table precompression.  */
2295     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2296             depth+1 );
2297     Perl_re_indentf( aTHX_  "%s",
2298             depth+1, "------:-----+-----------------\n" );
2299
2300     for( state=1 ; state < next_alloc ; state ++ ) {
2301         U16 charid;
2302
2303         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2304             depth+1, (UV)state  );
2305         if ( ! trie->states[ state ].wordnum ) {
2306             Perl_re_printf( aTHX_  "%5s| ","");
2307         } else {
2308             Perl_re_printf( aTHX_  "W%4x| ",
2309                 trie->states[ state ].wordnum
2310             );
2311         }
2312         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2313             SV ** const tmp = av_fetch( revcharmap,
2314                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2315             if ( tmp ) {
2316                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2317                     colwidth,
2318                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2319                               colwidth,
2320                               PL_colors[0], PL_colors[1],
2321                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2322                               | PERL_PV_ESCAPE_FIRSTCHAR
2323                     ) ,
2324                     TRIE_LIST_ITEM(state, charid).forid,
2325                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2326                 );
2327                 if (!(charid % 10))
2328                     Perl_re_printf( aTHX_  "\n%*s| ",
2329                         (int)((depth * 2) + 14), "");
2330             }
2331         }
2332         Perl_re_printf( aTHX_  "\n");
2333     }
2334 }
2335
2336 /*
2337   Dumps a fully constructed but uncompressed trie in table form.
2338   This is the normal DFA style state transition table, with a few
2339   twists to facilitate compression later.
2340   Used for debugging make_trie().
2341 */
2342 STATIC void
2343 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2344                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2345                           U32 depth)
2346 {
2347     U32 state;
2348     U16 charid;
2349     SV *sv=sv_newmortal();
2350     int colwidth= widecharmap ? 6 : 4;
2351     GET_RE_DEBUG_FLAGS_DECL;
2352
2353     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2354
2355     /*
2356        print out the table precompression so that we can do a visual check
2357        that they are identical.
2358      */
2359
2360     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2361
2362     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2363         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2364         if ( tmp ) {
2365             Perl_re_printf( aTHX_  "%*s",
2366                 colwidth,
2367                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2368                             PL_colors[0], PL_colors[1],
2369                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370                             PERL_PV_ESCAPE_FIRSTCHAR
2371                 )
2372             );
2373         }
2374     }
2375
2376     Perl_re_printf( aTHX_ "\n");
2377     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2378
2379     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2380         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2381     }
2382
2383     Perl_re_printf( aTHX_  "\n" );
2384
2385     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2386
2387         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2388             depth+1,
2389             (UV)TRIE_NODENUM( state ) );
2390
2391         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2392             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2393             if (v)
2394                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2395             else
2396                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2397         }
2398         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2399             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2400                                             (UV)trie->trans[ state ].check );
2401         } else {
2402             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2403                                             (UV)trie->trans[ state ].check,
2404             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2405         }
2406     }
2407 }
2408
2409 #endif
2410
2411
2412 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2413   startbranch: the first branch in the whole branch sequence
2414   first      : start branch of sequence of branch-exact nodes.
2415                May be the same as startbranch
2416   last       : Thing following the last branch.
2417                May be the same as tail.
2418   tail       : item following the branch sequence
2419   count      : words in the sequence
2420   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2421   depth      : indent depth
2422
2423 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2424
2425 A trie is an N'ary tree where the branches are determined by digital
2426 decomposition of the key. IE, at the root node you look up the 1st character and
2427 follow that branch repeat until you find the end of the branches. Nodes can be
2428 marked as "accepting" meaning they represent a complete word. Eg:
2429
2430   /he|she|his|hers/
2431
2432 would convert into the following structure. Numbers represent states, letters
2433 following numbers represent valid transitions on the letter from that state, if
2434 the number is in square brackets it represents an accepting state, otherwise it
2435 will be in parenthesis.
2436
2437       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2438       |    |
2439       |   (2)
2440       |    |
2441      (1)   +-i->(6)-+-s->[7]
2442       |
2443       +-s->(3)-+-h->(4)-+-e->[5]
2444
2445       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2446
2447 This shows that when matching against the string 'hers' we will begin at state 1
2448 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2449 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2450 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2451 single traverse. We store a mapping from accepting to state to which word was
2452 matched, and then when we have multiple possibilities we try to complete the
2453 rest of the regex in the order in which they occurred in the alternation.
2454
2455 The only prior NFA like behaviour that would be changed by the TRIE support is
2456 the silent ignoring of duplicate alternations which are of the form:
2457
2458  / (DUPE|DUPE) X? (?{ ... }) Y /x
2459
2460 Thus EVAL blocks following a trie may be called a different number of times with
2461 and without the optimisation. With the optimisations dupes will be silently
2462 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2463 the following demonstrates:
2464
2465  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2466
2467 which prints out 'word' three times, but
2468
2469  'words'=~/(word|word|word)(?{ print $1 })S/
2470
2471 which doesnt print it out at all. This is due to other optimisations kicking in.
2472
2473 Example of what happens on a structural level:
2474
2475 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2476
2477    1: CURLYM[1] {1,32767}(18)
2478    5:   BRANCH(8)
2479    6:     EXACT <ac>(16)
2480    8:   BRANCH(11)
2481    9:     EXACT <ad>(16)
2482   11:   BRANCH(14)
2483   12:     EXACT <ab>(16)
2484   16:   SUCCEED(0)
2485   17:   NOTHING(18)
2486   18: END(0)
2487
2488 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2489 and should turn into:
2490
2491    1: CURLYM[1] {1,32767}(18)
2492    5:   TRIE(16)
2493         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2494           <ac>
2495           <ad>
2496           <ab>
2497   16:   SUCCEED(0)
2498   17:   NOTHING(18)
2499   18: END(0)
2500
2501 Cases where tail != last would be like /(?foo|bar)baz/:
2502
2503    1: BRANCH(4)
2504    2:   EXACT <foo>(8)
2505    4: BRANCH(7)
2506    5:   EXACT <bar>(8)
2507    7: TAIL(8)
2508    8: EXACT <baz>(10)
2509   10: END(0)
2510
2511 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2512 and would end up looking like:
2513
2514     1: TRIE(8)
2515       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2516         <foo>
2517         <bar>
2518    7: TAIL(8)
2519    8: EXACT <baz>(10)
2520   10: END(0)
2521
2522     d = uvchr_to_utf8_flags(d, uv, 0);
2523
2524 is the recommended Unicode-aware way of saying
2525
2526     *(d++) = uv;
2527 */
2528
2529 #define TRIE_STORE_REVCHAR(val)                                            \
2530     STMT_START {                                                           \
2531         if (UTF) {                                                         \
2532             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2533             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2534             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2535             *kapow = '\0';                                                 \
2536             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2537             SvPOK_on(zlopp);                                               \
2538             SvUTF8_on(zlopp);                                              \
2539             av_push(revcharmap, zlopp);                                    \
2540         } else {                                                           \
2541             char ooooff = (char)val;                                           \
2542             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2543         }                                                                  \
2544         } STMT_END
2545
2546 /* This gets the next character from the input, folding it if not already
2547  * folded. */
2548 #define TRIE_READ_CHAR STMT_START {                                           \
2549     wordlen++;                                                                \
2550     if ( UTF ) {                                                              \
2551         /* if it is UTF then it is either already folded, or does not need    \
2552          * folding */                                                         \
2553         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2554     }                                                                         \
2555     else if (folder == PL_fold_latin1) {                                      \
2556         /* This folder implies Unicode rules, which in the range expressible  \
2557          *  by not UTF is the lower case, with the two exceptions, one of     \
2558          *  which should have been taken care of before calling this */       \
2559         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2560         uvc = toLOWER_L1(*uc);                                                \
2561         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2562         len = 1;                                                              \
2563     } else {                                                                  \
2564         /* raw data, will be folded later if needed */                        \
2565         uvc = (U32)*uc;                                                       \
2566         len = 1;                                                              \
2567     }                                                                         \
2568 } STMT_END
2569
2570
2571
2572 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2573     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2574         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2575         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2576         TRIE_LIST_LEN( state ) = ging;                          \
2577     }                                                           \
2578     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2579     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2580     TRIE_LIST_CUR( state )++;                                   \
2581 } STMT_END
2582
2583 #define TRIE_LIST_NEW(state) STMT_START {                       \
2584     Newx( trie->states[ state ].trans.list,                     \
2585         4, reg_trie_trans_le );                                 \
2586      TRIE_LIST_CUR( state ) = 1;                                \
2587      TRIE_LIST_LEN( state ) = 4;                                \
2588 } STMT_END
2589
2590 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2591     U16 dupe= trie->states[ state ].wordnum;                    \
2592     regnode * const noper_next = regnext( noper );              \
2593                                                                 \
2594     DEBUG_r({                                                   \
2595         /* store the word for dumping */                        \
2596         SV* tmp;                                                \
2597         if (OP(noper) != NOTHING)                               \
2598             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2599         else                                                    \
2600             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2601         av_push( trie_words, tmp );                             \
2602     });                                                         \
2603                                                                 \
2604     curword++;                                                  \
2605     trie->wordinfo[curword].prev   = 0;                         \
2606     trie->wordinfo[curword].len    = wordlen;                   \
2607     trie->wordinfo[curword].accept = state;                     \
2608                                                                 \
2609     if ( noper_next < tail ) {                                  \
2610         if (!trie->jump)                                        \
2611             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2612                                                  sizeof(U16) ); \
2613         trie->jump[curword] = (U16)(noper_next - convert);      \
2614         if (!jumper)                                            \
2615             jumper = noper_next;                                \
2616         if (!nextbranch)                                        \
2617             nextbranch= regnext(cur);                           \
2618     }                                                           \
2619                                                                 \
2620     if ( dupe ) {                                               \
2621         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2622         /* chain, so that when the bits of chain are later    */\
2623         /* linked together, the dups appear in the chain      */\
2624         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2625         trie->wordinfo[dupe].prev = curword;                    \
2626     } else {                                                    \
2627         /* we haven't inserted this word yet.                */ \
2628         trie->states[ state ].wordnum = curword;                \
2629     }                                                           \
2630 } STMT_END
2631
2632
2633 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2634      ( ( base + charid >=  ucharcount                                   \
2635          && base + charid < ubound                                      \
2636          && state == trie->trans[ base - ucharcount + charid ].check    \
2637          && trie->trans[ base - ucharcount + charid ].next )            \
2638            ? trie->trans[ base - ucharcount + charid ].next             \
2639            : ( state==1 ? special : 0 )                                 \
2640       )
2641
2642 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2643 STMT_START {                                                \
2644     TRIE_BITMAP_SET(trie, uvc);                             \
2645     /* store the folded codepoint */                        \
2646     if ( folder )                                           \
2647         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2648                                                             \
2649     if ( !UTF ) {                                           \
2650         /* store first byte of utf8 representation of */    \
2651         /* variant codepoints */                            \
2652         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2653             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2654         }                                                   \
2655     }                                                       \
2656 } STMT_END
2657 #define MADE_TRIE       1
2658 #define MADE_JUMP_TRIE  2
2659 #define MADE_EXACT_TRIE 4
2660
2661 STATIC I32
2662 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2663                   regnode *first, regnode *last, regnode *tail,
2664                   U32 word_count, U32 flags, U32 depth)
2665 {
2666     /* first pass, loop through and scan words */
2667     reg_trie_data *trie;
2668     HV *widecharmap = NULL;
2669     AV *revcharmap = newAV();
2670     regnode *cur;
2671     STRLEN len = 0;
2672     UV uvc = 0;
2673     U16 curword = 0;
2674     U32 next_alloc = 0;
2675     regnode *jumper = NULL;
2676     regnode *nextbranch = NULL;
2677     regnode *convert = NULL;
2678     U32 *prev_states; /* temp array mapping each state to previous one */
2679     /* we just use folder as a flag in utf8 */
2680     const U8 * folder = NULL;
2681
2682     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2683      * which stands for one trie structure, one hash, optionally followed
2684      * by two arrays */
2685 #ifdef DEBUGGING
2686     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2687     AV *trie_words = NULL;
2688     /* along with revcharmap, this only used during construction but both are
2689      * useful during debugging so we store them in the struct when debugging.
2690      */
2691 #else
2692     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2693     STRLEN trie_charcount=0;
2694 #endif
2695     SV *re_trie_maxbuff;
2696     GET_RE_DEBUG_FLAGS_DECL;
2697
2698     PERL_ARGS_ASSERT_MAKE_TRIE;
2699 #ifndef DEBUGGING
2700     PERL_UNUSED_ARG(depth);
2701 #endif
2702
2703     switch (flags) {
2704         case EXACT: case EXACT_ONLY8: case EXACTL: break;
2705         case EXACTFAA:
2706         case EXACTFUP:
2707         case EXACTFU:
2708         case EXACTFLU8: folder = PL_fold_latin1; break;
2709         case EXACTF:  folder = PL_fold; break;
2710         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2711     }
2712
2713     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2714     trie->refcount = 1;
2715     trie->startstate = 1;
2716     trie->wordcount = word_count;
2717     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2718     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2719     if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2720         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2721     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2722                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2723
2724     DEBUG_r({
2725         trie_words = newAV();
2726     });
2727
2728     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2729     assert(re_trie_maxbuff);
2730     if (!SvIOK(re_trie_maxbuff)) {
2731         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2732     }
2733     DEBUG_TRIE_COMPILE_r({
2734         Perl_re_indentf( aTHX_
2735           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2736           depth+1,
2737           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2738           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2739     });
2740
2741    /* Find the node we are going to overwrite */
2742     if ( first == startbranch && OP( last ) != BRANCH ) {
2743         /* whole branch chain */
2744         convert = first;
2745     } else {
2746         /* branch sub-chain */
2747         convert = NEXTOPER( first );
2748     }
2749
2750     /*  -- First loop and Setup --
2751
2752        We first traverse the branches and scan each word to determine if it
2753        contains widechars, and how many unique chars there are, this is
2754        important as we have to build a table with at least as many columns as we
2755        have unique chars.
2756
2757        We use an array of integers to represent the character codes 0..255
2758        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2759        the native representation of the character value as the key and IV's for
2760        the coded index.
2761
2762        *TODO* If we keep track of how many times each character is used we can
2763        remap the columns so that the table compression later on is more
2764        efficient in terms of memory by ensuring the most common value is in the
2765        middle and the least common are on the outside.  IMO this would be better
2766        than a most to least common mapping as theres a decent chance the most
2767        common letter will share a node with the least common, meaning the node
2768        will not be compressible. With a middle is most common approach the worst
2769        case is when we have the least common nodes twice.
2770
2771      */
2772
2773     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2774         regnode *noper = NEXTOPER( cur );
2775         const U8 *uc;
2776         const U8 *e;
2777         int foldlen = 0;
2778         U32 wordlen      = 0;         /* required init */
2779         STRLEN minchars = 0;
2780         STRLEN maxchars = 0;
2781         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2782                                                bitmap?*/
2783
2784         if (OP(noper) == NOTHING) {
2785             /* skip past a NOTHING at the start of an alternation
2786              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2787              */
2788             regnode *noper_next= regnext(noper);
2789             if (noper_next < tail)
2790                 noper= noper_next;
2791         }
2792
2793         if (    noper < tail
2794             && (    OP(noper) == flags
2795                 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2796                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2797                                          || OP(noper) == EXACTFUP))))
2798         {
2799             uc= (U8*)STRING(noper);
2800             e= uc + STR_LEN(noper);
2801         } else {
2802             trie->minlen= 0;
2803             continue;
2804         }
2805
2806
2807         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2808             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2809                                           regardless of encoding */
2810             if (OP( noper ) == EXACTFUP) {
2811                 /* false positives are ok, so just set this */
2812                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2813             }
2814         }
2815
2816         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2817                                            branch */
2818             TRIE_CHARCOUNT(trie)++;
2819             TRIE_READ_CHAR;
2820
2821             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2822              * is in effect.  Under /i, this character can match itself, or
2823              * anything that folds to it.  If not under /i, it can match just
2824              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2825              * all fold to k, and all are single characters.   But some folds
2826              * expand to more than one character, so for example LATIN SMALL
2827              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2828              * the string beginning at 'uc' is 'ffi', it could be matched by
2829              * three characters, or just by the one ligature character. (It
2830              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2831              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2832              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2833              * match.)  The trie needs to know the minimum and maximum number
2834              * of characters that could match so that it can use size alone to
2835              * quickly reject many match attempts.  The max is simple: it is
2836              * the number of folded characters in this branch (since a fold is
2837              * never shorter than what folds to it. */
2838
2839             maxchars++;
2840
2841             /* And the min is equal to the max if not under /i (indicated by
2842              * 'folder' being NULL), or there are no multi-character folds.  If
2843              * there is a multi-character fold, the min is incremented just
2844              * once, for the character that folds to the sequence.  Each
2845              * character in the sequence needs to be added to the list below of
2846              * characters in the trie, but we count only the first towards the
2847              * min number of characters needed.  This is done through the
2848              * variable 'foldlen', which is returned by the macros that look
2849              * for these sequences as the number of bytes the sequence
2850              * occupies.  Each time through the loop, we decrement 'foldlen' by
2851              * how many bytes the current char occupies.  Only when it reaches
2852              * 0 do we increment 'minchars' or look for another multi-character
2853              * sequence. */
2854             if (folder == NULL) {
2855                 minchars++;
2856             }
2857             else if (foldlen > 0) {
2858                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2859             }
2860             else {
2861                 minchars++;
2862
2863                 /* See if *uc is the beginning of a multi-character fold.  If
2864                  * so, we decrement the length remaining to look at, to account
2865                  * for the current character this iteration.  (We can use 'uc'
2866                  * instead of the fold returned by TRIE_READ_CHAR because for
2867                  * non-UTF, the latin1_safe macro is smart enough to account
2868                  * for all the unfolded characters, and because for UTF, the
2869                  * string will already have been folded earlier in the
2870                  * compilation process */
2871                 if (UTF) {
2872                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2873                         foldlen -= UTF8SKIP(uc);
2874                     }
2875                 }
2876                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2877                     foldlen--;
2878                 }
2879             }
2880
2881             /* The current character (and any potential folds) should be added
2882              * to the possible matching characters for this position in this
2883              * branch */
2884             if ( uvc < 256 ) {
2885                 if ( folder ) {
2886                     U8 folded= folder[ (U8) uvc ];
2887                     if ( !trie->charmap[ folded ] ) {
2888                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2889                         TRIE_STORE_REVCHAR( folded );
2890                     }
2891                 }
2892                 if ( !trie->charmap[ uvc ] ) {
2893                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2894                     TRIE_STORE_REVCHAR( uvc );
2895                 }
2896                 if ( set_bit ) {
2897                     /* store the codepoint in the bitmap, and its folded
2898                      * equivalent. */
2899                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2900                     set_bit = 0; /* We've done our bit :-) */
2901                 }
2902             } else {
2903
2904                 /* XXX We could come up with the list of code points that fold
2905                  * to this using PL_utf8_foldclosures, except not for
2906                  * multi-char folds, as there may be multiple combinations
2907                  * there that could work, which needs to wait until runtime to
2908                  * resolve (The comment about LIGATURE FFI above is such an
2909                  * example */
2910
2911                 SV** svpp;
2912                 if ( !widecharmap )
2913                     widecharmap = newHV();
2914
2915                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2916
2917                 if ( !svpp )
2918                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2919
2920                 if ( !SvTRUE( *svpp ) ) {
2921                     sv_setiv( *svpp, ++trie->uniquecharcount );
2922                     TRIE_STORE_REVCHAR(uvc);
2923                 }
2924             }
2925         } /* end loop through characters in this branch of the trie */
2926
2927         /* We take the min and max for this branch and combine to find the min
2928          * and max for all branches processed so far */
2929         if( cur == first ) {
2930             trie->minlen = minchars;
2931             trie->maxlen = maxchars;
2932         } else if (minchars < trie->minlen) {
2933             trie->minlen = minchars;
2934         } else if (maxchars > trie->maxlen) {
2935             trie->maxlen = maxchars;
2936         }
2937     } /* end first pass */
2938     DEBUG_TRIE_COMPILE_r(
2939         Perl_re_indentf( aTHX_
2940                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2941                 depth+1,
2942                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2943                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2944                 (int)trie->minlen, (int)trie->maxlen )
2945     );
2946
2947     /*
2948         We now know what we are dealing with in terms of unique chars and
2949         string sizes so we can calculate how much memory a naive
2950         representation using a flat table  will take. If it's over a reasonable
2951         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2952         conservative but potentially much slower representation using an array
2953         of lists.
2954
2955         At the end we convert both representations into the same compressed
2956         form that will be used in regexec.c for matching with. The latter
2957         is a form that cannot be used to construct with but has memory
2958         properties similar to the list form and access properties similar
2959         to the table form making it both suitable for fast searches and
2960         small enough that its feasable to store for the duration of a program.
2961
2962         See the comment in the code where the compressed table is produced
2963         inplace from the flat tabe representation for an explanation of how
2964         the compression works.
2965
2966     */
2967
2968
2969     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2970     prev_states[1] = 0;
2971
2972     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2973                                                     > SvIV(re_trie_maxbuff) )
2974     {
2975         /*
2976             Second Pass -- Array Of Lists Representation
2977
2978             Each state will be represented by a list of charid:state records
2979             (reg_trie_trans_le) the first such element holds the CUR and LEN
2980             points of the allocated array. (See defines above).
2981
2982             We build the initial structure using the lists, and then convert
2983             it into the compressed table form which allows faster lookups
2984             (but cant be modified once converted).
2985         */
2986
2987         STRLEN transcount = 1;
2988
2989         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2990             depth+1));
2991
2992         trie->states = (reg_trie_state *)
2993             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2994                                   sizeof(reg_trie_state) );
2995         TRIE_LIST_NEW(1);
2996         next_alloc = 2;
2997
2998         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2999
3000             regnode *noper   = NEXTOPER( cur );
3001             U32 state        = 1;         /* required init */
3002             U16 charid       = 0;         /* sanity init */
3003             U32 wordlen      = 0;         /* required init */
3004
3005             if (OP(noper) == NOTHING) {
3006                 regnode *noper_next= regnext(noper);
3007                 if (noper_next < tail)
3008                     noper= noper_next;
3009             }
3010
3011             if (    noper < tail
3012                 && (    OP(noper) == flags
3013                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3014                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3015                                              || OP(noper) == EXACTFUP))))
3016             {
3017                 const U8 *uc= (U8*)STRING(noper);
3018                 const U8 *e= uc + STR_LEN(noper);
3019
3020                 for ( ; uc < e ; uc += len ) {
3021
3022                     TRIE_READ_CHAR;
3023
3024                     if ( uvc < 256 ) {
3025                         charid = trie->charmap[ uvc ];
3026                     } else {
3027                         SV** const svpp = hv_fetch( widecharmap,
3028                                                     (char*)&uvc,
3029                                                     sizeof( UV ),
3030                                                     0);
3031                         if ( !svpp ) {
3032                             charid = 0;
3033                         } else {
3034                             charid=(U16)SvIV( *svpp );
3035                         }
3036                     }
3037                     /* charid is now 0 if we dont know the char read, or
3038                      * nonzero if we do */
3039                     if ( charid ) {
3040
3041                         U16 check;
3042                         U32 newstate = 0;
3043
3044                         charid--;
3045                         if ( !trie->states[ state ].trans.list ) {
3046                             TRIE_LIST_NEW( state );
3047                         }
3048                         for ( check = 1;
3049                               check <= TRIE_LIST_USED( state );
3050                               check++ )
3051                         {
3052                             if ( TRIE_LIST_ITEM( state, check ).forid
3053                                                                     == charid )
3054                             {
3055                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3056                                 break;
3057                             }
3058                         }
3059                         if ( ! newstate ) {
3060                             newstate = next_alloc++;
3061                             prev_states[newstate] = state;
3062                             TRIE_LIST_PUSH( state, charid, newstate );
3063                             transcount++;
3064                         }
3065                         state = newstate;
3066                     } else {
3067                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3068                     }
3069                 }
3070             }
3071             TRIE_HANDLE_WORD(state);
3072
3073         } /* end second pass */
3074
3075         /* next alloc is the NEXT state to be allocated */
3076         trie->statecount = next_alloc;
3077         trie->states = (reg_trie_state *)
3078             PerlMemShared_realloc( trie->states,
3079                                    next_alloc
3080                                    * sizeof(reg_trie_state) );
3081
3082         /* and now dump it out before we compress it */
3083         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3084                                                          revcharmap, next_alloc,
3085                                                          depth+1)
3086         );
3087
3088         trie->trans = (reg_trie_trans *)
3089             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3090         {
3091             U32 state;
3092             U32 tp = 0;
3093             U32 zp = 0;
3094
3095
3096             for( state=1 ; state < next_alloc ; state ++ ) {
3097                 U32 base=0;
3098
3099                 /*
3100                 DEBUG_TRIE_COMPILE_MORE_r(
3101                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3102                 );
3103                 */
3104
3105                 if (trie->states[state].trans.list) {
3106                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3107                     U16 maxid=minid;
3108                     U16 idx;
3109
3110                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3111                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3112                         if ( forid < minid ) {
3113                             minid=forid;
3114                         } else if ( forid > maxid ) {
3115                             maxid=forid;
3116                         }
3117                     }
3118                     if ( transcount < tp + maxid - minid + 1) {
3119                         transcount *= 2;
3120                         trie->trans = (reg_trie_trans *)
3121                             PerlMemShared_realloc( trie->trans,
3122                                                      transcount
3123                                                      * sizeof(reg_trie_trans) );
3124                         Zero( trie->trans + (transcount / 2),
3125                               transcount / 2,
3126                               reg_trie_trans );
3127                     }
3128                     base = trie->uniquecharcount + tp - minid;
3129                     if ( maxid == minid ) {
3130                         U32 set = 0;
3131                         for ( ; zp < tp ; zp++ ) {
3132                             if ( ! trie->trans[ zp ].next ) {
3133                                 base = trie->uniquecharcount + zp - minid;
3134                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3135                                                                    1).newstate;
3136                                 trie->trans[ zp ].check = state;
3137                                 set = 1;
3138                                 break;
3139                             }
3140                         }
3141                         if ( !set ) {
3142                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3143                                                                    1).newstate;
3144                             trie->trans[ tp ].check = state;
3145                             tp++;
3146                             zp = tp;
3147                         }
3148                     } else {
3149                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3150                             const U32 tid = base
3151                                            - trie->uniquecharcount
3152                                            + TRIE_LIST_ITEM( state, idx ).forid;
3153                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3154                                                                 idx ).newstate;
3155                             trie->trans[ tid ].check = state;
3156                         }
3157                         tp += ( maxid - minid + 1 );
3158                     }
3159                     Safefree(trie->states[ state ].trans.list);
3160                 }
3161                 /*
3162                 DEBUG_TRIE_COMPILE_MORE_r(
3163                     Perl_re_printf( aTHX_  " base: %d\n",base);
3164                 );
3165                 */
3166                 trie->states[ state ].trans.base=base;
3167             }
3168             trie->lasttrans = tp + 1;
3169         }
3170     } else {
3171         /*
3172            Second Pass -- Flat Table Representation.
3173
3174            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3175            each.  We know that we will need Charcount+1 trans at most to store
3176            the data (one row per char at worst case) So we preallocate both
3177            structures assuming worst case.
3178
3179            We then construct the trie using only the .next slots of the entry
3180            structs.
3181
3182            We use the .check field of the first entry of the node temporarily
3183            to make compression both faster and easier by keeping track of how
3184            many non zero fields are in the node.
3185
3186            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3187            transition.
3188
3189            There are two terms at use here: state as a TRIE_NODEIDX() which is
3190            a number representing the first entry of the node, and state as a
3191            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3192            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3193            if there are 2 entrys per node. eg:
3194
3195              A B       A B
3196           1. 2 4    1. 3 7
3197           2. 0 3    3. 0 5
3198           3. 0 0    5. 0 0
3199           4. 0 0    7. 0 0
3200
3201            The table is internally in the right hand, idx form. However as we
3202            also have to deal with the states array which is indexed by nodenum
3203            we have to use TRIE_NODENUM() to convert.
3204
3205         */
3206         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3207             depth+1));
3208
3209         trie->trans = (reg_trie_trans *)
3210             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3211                                   * trie->uniquecharcount + 1,
3212                                   sizeof(reg_trie_trans) );
3213         trie->states = (reg_trie_state *)
3214             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3215                                   sizeof(reg_trie_state) );
3216         next_alloc = trie->uniquecharcount + 1;
3217
3218
3219         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3220
3221             regnode *noper   = NEXTOPER( cur );
3222
3223             U32 state        = 1;         /* required init */
3224
3225             U16 charid       = 0;         /* sanity init */
3226             U32 accept_state = 0;         /* sanity init */
3227
3228             U32 wordlen      = 0;         /* required init */
3229
3230             if (OP(noper) == NOTHING) {
3231                 regnode *noper_next= regnext(noper);
3232                 if (noper_next < tail)
3233                     noper= noper_next;
3234             }
3235
3236             if (    noper < tail
3237                 && (    OP(noper) == flags
3238                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3239                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3240                                              || OP(noper) == EXACTFUP))))
3241             {
3242                 const U8 *uc= (U8*)STRING(noper);
3243                 const U8 *e= uc + STR_LEN(noper);
3244
3245                 for ( ; uc < e ; uc += len ) {
3246
3247                     TRIE_READ_CHAR;
3248
3249                     if ( uvc < 256 ) {
3250                         charid = trie->charmap[ uvc ];
3251                     } else {
3252                         SV* const * const svpp = hv_fetch( widecharmap,
3253                                                            (char*)&uvc,
3254                                                            sizeof( UV ),
3255                                                            0);
3256                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3257                     }
3258                     if ( charid ) {
3259                         charid--;
3260                         if ( !trie->trans[ state + charid ].next ) {
3261                             trie->trans[ state + charid ].next = next_alloc;
3262                             trie->trans[ state ].check++;
3263                             prev_states[TRIE_NODENUM(next_alloc)]
3264                                     = TRIE_NODENUM(state);
3265                             next_alloc += trie->uniquecharcount;
3266                         }
3267                         state = trie->trans[ state + charid ].next;
3268                     } else {
3269                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3270                     }
3271                     /* charid is now 0 if we dont know the char read, or
3272                      * nonzero if we do */
3273                 }
3274             }
3275             accept_state = TRIE_NODENUM( state );
3276             TRIE_HANDLE_WORD(accept_state);
3277
3278         } /* end second pass */
3279
3280         /* and now dump it out before we compress it */
3281         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3282                                                           revcharmap,
3283                                                           next_alloc, depth+1));
3284
3285         {
3286         /*
3287            * Inplace compress the table.*
3288
3289            For sparse data sets the table constructed by the trie algorithm will
3290            be mostly 0/FAIL transitions or to put it another way mostly empty.
3291            (Note that leaf nodes will not contain any transitions.)
3292
3293            This algorithm compresses the tables by eliminating most such
3294            transitions, at the cost of a modest bit of extra work during lookup:
3295
3296            - Each states[] entry contains a .base field which indicates the
3297            index in the state[] array wheres its transition data is stored.
3298
3299            - If .base is 0 there are no valid transitions from that node.
3300
3301            - If .base is nonzero then charid is added to it to find an entry in
3302            the trans array.
3303
3304            -If trans[states[state].base+charid].check!=state then the
3305            transition is taken to be a 0/Fail transition. Thus if there are fail
3306            transitions at the front of the node then the .base offset will point
3307            somewhere inside the previous nodes data (or maybe even into a node
3308            even earlier), but the .check field determines if the transition is
3309            valid.
3310
3311            XXX - wrong maybe?
3312            The following process inplace converts the table to the compressed
3313            table: We first do not compress the root node 1,and mark all its
3314            .check pointers as 1 and set its .base pointer as 1 as well. This
3315            allows us to do a DFA construction from the compressed table later,
3316            and ensures that any .base pointers we calculate later are greater
3317            than 0.
3318
3319            - We set 'pos' to indicate the first entry of the second node.
3320
3321            - We then iterate over the columns of the node, finding the first and
3322            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3323            and set the .check pointers accordingly, and advance pos
3324            appropriately and repreat for the next node. Note that when we copy
3325            the next pointers we have to convert them from the original
3326            NODEIDX form to NODENUM form as the former is not valid post
3327            compression.
3328
3329            - If a node has no transitions used we mark its base as 0 and do not
3330            advance the pos pointer.
3331
3332            - If a node only has one transition we use a second pointer into the
3333            structure to fill in allocated fail transitions from other states.
3334            This pointer is independent of the main pointer and scans forward
3335            looking for null transitions that are allocated to a state. When it
3336            finds one it writes the single transition into the "hole".  If the
3337            pointer doesnt find one the single transition is appended as normal.
3338
3339            - Once compressed we can Renew/realloc the structures to release the
3340            excess space.
3341
3342            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3343            specifically Fig 3.47 and the associated pseudocode.
3344
3345            demq
3346         */
3347         const U32 laststate = TRIE_NODENUM( next_alloc );
3348         U32 state, charid;
3349         U32 pos = 0, zp=0;
3350         trie->statecount = laststate;
3351
3352         for ( state = 1 ; state < laststate ; state++ ) {
3353             U8 flag = 0;
3354             const U32 stateidx = TRIE_NODEIDX( state );
3355             const U32 o_used = trie->trans[ stateidx ].check;
3356             U32 used = trie->trans[ stateidx ].check;
3357             trie->trans[ stateidx ].check = 0;
3358
3359             for ( charid = 0;
3360                   used && charid < trie->uniquecharcount;
3361                   charid++ )
3362             {
3363                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3364                     if ( trie->trans[ stateidx + charid ].next ) {
3365                         if (o_used == 1) {
3366                             for ( ; zp < pos ; zp++ ) {
3367                                 if ( ! trie->trans[ zp ].next ) {
3368                                     break;
3369                                 }
3370                             }
3371                             trie->states[ state ].trans.base
3372                                                     = zp
3373                                                       + trie->uniquecharcount
3374                                                       - charid ;
3375                             trie->trans[ zp ].next
3376                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3377                                                              + charid ].next );
3378                             trie->trans[ zp ].check = state;
3379                             if ( ++zp > pos ) pos = zp;
3380                             break;
3381                         }
3382                         used--;
3383                     }
3384                     if ( !flag ) {
3385                         flag = 1;
3386                         trie->states[ state ].trans.base
3387                                        = pos + trie->uniquecharcount - charid ;
3388                     }
3389                     trie->trans[ pos ].next
3390                         = SAFE_TRIE_NODENUM(
3391                                        trie->trans[ stateidx + charid ].next );
3392                     trie->trans[ pos ].check = state;
3393                     pos++;
3394                 }
3395             }
3396         }
3397         trie->lasttrans = pos + 1;
3398         trie->states = (reg_trie_state *)
3399             PerlMemShared_realloc( trie->states, laststate
3400                                    * sizeof(reg_trie_state) );
3401         DEBUG_TRIE_COMPILE_MORE_r(
3402             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3403                 depth+1,
3404                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3405                        + 1 ),
3406                 (IV)next_alloc,
3407                 (IV)pos,
3408                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3409             );
3410
3411         } /* end table compress */
3412     }
3413     DEBUG_TRIE_COMPILE_MORE_r(
3414             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3415                 depth+1,
3416                 (UV)trie->statecount,
3417                 (UV)trie->lasttrans)
3418     );
3419     /* resize the trans array to remove unused space */
3420     trie->trans = (reg_trie_trans *)
3421         PerlMemShared_realloc( trie->trans, trie->lasttrans
3422                                * sizeof(reg_trie_trans) );
3423
3424     {   /* Modify the program and insert the new TRIE node */
3425         U8 nodetype =(U8)(flags & 0xFF);
3426         char *str=NULL;
3427
3428 #ifdef DEBUGGING
3429         regnode *optimize = NULL;
3430 #ifdef RE_TRACK_PATTERN_OFFSETS
3431
3432         U32 mjd_offset = 0;
3433         U32 mjd_nodelen = 0;
3434 #endif /* RE_TRACK_PATTERN_OFFSETS */
3435 #endif /* DEBUGGING */
3436         /*
3437            This means we convert either the first branch or the first Exact,
3438            depending on whether the thing following (in 'last') is a branch
3439            or not and whther first is the startbranch (ie is it a sub part of
3440            the alternation or is it the whole thing.)
3441            Assuming its a sub part we convert the EXACT otherwise we convert
3442            the whole branch sequence, including the first.
3443          */
3444         /* Find the node we are going to overwrite */
3445         if ( first != startbranch || OP( last ) == BRANCH ) {
3446             /* branch sub-chain */
3447             NEXT_OFF( first ) = (U16)(last - first);
3448 #ifdef RE_TRACK_PATTERN_OFFSETS
3449             DEBUG_r({
3450                 mjd_offset= Node_Offset((convert));
3451                 mjd_nodelen= Node_Length((convert));
3452             });
3453 #endif
3454             /* whole branch chain */
3455         }
3456 #ifdef RE_TRACK_PATTERN_OFFSETS
3457         else {
3458             DEBUG_r({
3459                 const  regnode *nop = NEXTOPER( convert );
3460                 mjd_offset= Node_Offset((nop));
3461                 mjd_nodelen= Node_Length((nop));
3462             });
3463         }
3464         DEBUG_OPTIMISE_r(
3465             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3466                 depth+1,
3467                 (UV)mjd_offset, (UV)mjd_nodelen)
3468         );
3469 #endif
3470         /* But first we check to see if there is a common prefix we can
3471            split out as an EXACT and put in front of the TRIE node.  */
3472         trie->startstate= 1;
3473         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3474             /* we want to find the first state that has more than
3475              * one transition, if that state is not the first state
3476              * then we have a common prefix which we can remove.
3477              */
3478             U32 state;
3479             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3480                 U32 ofs = 0;
3481                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3482                                        transition, -1 means none */
3483                 U32 count = 0;
3484                 const U32 base = trie->states[ state ].trans.base;
3485
3486                 /* does this state terminate an alternation? */
3487                 if ( trie->states[state].wordnum )
3488                         count = 1;
3489
3490                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3491                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3492                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3493                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3494                     {
3495                         if ( ++count > 1 ) {
3496                             /* we have more than one transition */
3497                             SV **tmp;
3498                             U8 *ch;
3499                             /* if this is the first state there is no common prefix
3500                              * to extract, so we can exit */
3501                             if ( state == 1 ) break;
3502                             tmp = av_fetch( revcharmap, ofs, 0);
3503                             ch = (U8*)SvPV_nolen_const( *tmp );
3504
3505                             /* if we are on count 2 then we need to initialize the
3506                              * bitmap, and store the previous char if there was one
3507                              * in it*/
3508                             if ( count == 2 ) {
3509                                 /* clear the bitmap */
3510                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3511                                 DEBUG_OPTIMISE_r(
3512                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3513                                         depth+1,
3514                                         (UV)state));
3515                                 if (first_ofs >= 0) {
3516                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3517                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3518
3519                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3520                                     DEBUG_OPTIMISE_r(
3521                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3522                                     );
3523                                 }
3524                             }
3525                             /* store the current firstchar in the bitmap */
3526                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3527                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3528                         }
3529                         first_ofs = ofs;
3530                     }
3531                 }
3532                 if ( count == 1 ) {
3533                     /* This state has only one transition, its transition is part
3534                      * of a common prefix - we need to concatenate the char it
3535                      * represents to what we have so far. */
3536                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3537                     STRLEN len;
3538                     char *ch = SvPV( *tmp, len );
3539                     DEBUG_OPTIMISE_r({
3540                         SV *sv=sv_newmortal();
3541                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3542                             depth+1,
3543                             (UV)state, (UV)first_ofs,
3544                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3545                                 PL_colors[0], PL_colors[1],
3546                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3547                                 PERL_PV_ESCAPE_FIRSTCHAR
3548                             )
3549                         );
3550                     });
3551                     if ( state==1 ) {
3552                         OP( convert ) = nodetype;
3553                         str=STRING(convert);
3554                         STR_LEN(convert)=0;
3555                     }
3556                     STR_LEN(convert) += len;
3557                     while (len--)
3558                         *str++ = *ch++;
3559                 } else {
3560 #ifdef DEBUGGING
3561                     if (state>1)
3562                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3563 #endif
3564                     break;
3565                 }
3566             }
3567             trie->prefixlen = (state-1);
3568             if (str) {
3569                 regnode *n = convert+NODE_SZ_STR(convert);
3570                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3571                 trie->startstate = state;
3572                 trie->minlen -= (state - 1);
3573                 trie->maxlen -= (state - 1);
3574 #ifdef DEBUGGING
3575                /* At least the UNICOS C compiler choked on this
3576                 * being argument to DEBUG_r(), so let's just have
3577                 * it right here. */
3578                if (
3579 #ifdef PERL_EXT_RE_BUILD
3580                    1
3581 #else
3582                    DEBUG_r_TEST
3583 #endif
3584                    ) {
3585                    regnode *fix = convert;
3586                    U32 word = trie->wordcount;
3587 #ifdef RE_TRACK_PATTERN_OFFSETS
3588                    mjd_nodelen++;
3589 #endif
3590                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3591                    while( ++fix < n ) {
3592                        Set_Node_Offset_Length(fix, 0, 0);
3593                    }
3594                    while (word--) {
3595                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3596                        if (tmp) {
3597                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3598                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3599                            else
3600                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3601                        }
3602                    }
3603                }
3604 #endif
3605                 if (trie->maxlen) {
3606                     convert = n;
3607                 } else {
3608                     NEXT_OFF(convert) = (U16)(tail - convert);
3609                     DEBUG_r(optimize= n);
3610                 }
3611             }
3612         }
3613         if (!jumper)
3614             jumper = last;
3615         if ( trie->maxlen ) {
3616             NEXT_OFF( convert ) = (U16)(tail - convert);
3617             ARG_SET( convert, data_slot );
3618             /* Store the offset to the first unabsorbed branch in
3619                jump[0], which is otherwise unused by the jump logic.
3620                We use this when dumping a trie and during optimisation. */
3621             if (trie->jump)
3622                 trie->jump[0] = (U16)(nextbranch - convert);
3623
3624             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3625              *   and there is a bitmap
3626              *   and the first "jump target" node we found leaves enough room
3627              * then convert the TRIE node into a TRIEC node, with the bitmap
3628              * embedded inline in the opcode - this is hypothetically faster.
3629              */
3630             if ( !trie->states[trie->startstate].wordnum
3631                  && trie->bitmap
3632                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3633             {
3634                 OP( convert ) = TRIEC;
3635                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3636                 PerlMemShared_free(trie->bitmap);
3637                 trie->bitmap= NULL;
3638             } else
3639                 OP( convert ) = TRIE;
3640
3641             /* store the type in the flags */
3642             convert->flags = nodetype;
3643             DEBUG_r({
3644             optimize = convert
3645                       + NODE_STEP_REGNODE
3646                       + regarglen[ OP( convert ) ];
3647             });
3648             /* XXX We really should free up the resource in trie now,
3649                    as we won't use them - (which resources?) dmq */
3650         }
3651         /* needed for dumping*/
3652         DEBUG_r(if (optimize) {
3653             regnode *opt = convert;
3654
3655             while ( ++opt < optimize) {
3656                 Set_Node_Offset_Length(opt, 0, 0);
3657             }
3658             /*
3659                 Try to clean up some of the debris left after the
3660                 optimisation.
3661              */
3662             while( optimize < jumper ) {
3663                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3664                 OP( optimize ) = OPTIMIZED;
3665                 Set_Node_Offset_Length(optimize, 0, 0);
3666                 optimize++;
3667             }
3668             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3669         });
3670     } /* end node insert */
3671
3672     /*  Finish populating the prev field of the wordinfo array.  Walk back
3673      *  from each accept state until we find another accept state, and if
3674      *  so, point the first word's .prev field at the second word. If the
3675      *  second already has a .prev field set, stop now. This will be the
3676      *  case either if we've already processed that word's accept state,
3677      *  or that state had multiple words, and the overspill words were
3678      *  already linked up earlier.
3679      */
3680     {
3681         U16 word;
3682         U32 state;
3683         U16 prev;
3684
3685         for (word=1; word <= trie->wordcount; word++) {
3686             prev = 0;
3687             if (trie->wordinfo[word].prev)
3688                 continue;
3689             state = trie->wordinfo[word].accept;
3690             while (state) {
3691                 state = prev_states[state];
3692                 if (!state)
3693                     break;
3694                 prev = trie->states[state].wordnum;
3695                 if (prev)
3696                     break;
3697             }
3698             trie->wordinfo[word].prev = prev;
3699         }
3700         Safefree(prev_states);
3701     }
3702
3703
3704     /* and now dump out the compressed format */
3705     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3706
3707     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3708 #ifdef DEBUGGING
3709     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3710     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3711 #else
3712     SvREFCNT_dec_NN(revcharmap);
3713 #endif
3714     return trie->jump
3715            ? MADE_JUMP_TRIE
3716            : trie->startstate>1
3717              ? MADE_EXACT_TRIE
3718              : MADE_TRIE;
3719 }
3720
3721 STATIC regnode *
3722 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3723 {
3724 /* The Trie is constructed and compressed now so we can build a fail array if
3725  * it's needed
3726
3727    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3728    3.32 in the
3729    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3730    Ullman 1985/88
3731    ISBN 0-201-10088-6
3732
3733    We find the fail state for each state in the trie, this state is the longest
3734    proper suffix of the current state's 'word' that is also a proper prefix of
3735    another word in our trie. State 1 represents the word '' and is thus the
3736    default fail state. This allows the DFA not to have to restart after its
3737    tried and failed a word at a given point, it simply continues as though it
3738    had been matching the other word in the first place.
3739    Consider
3740       'abcdgu'=~/abcdefg|cdgu/
3741    When we get to 'd' we are still matching the first word, we would encounter
3742    'g' which would fail, which would bring us to the state representing 'd' in
3743    the second word where we would try 'g' and succeed, proceeding to match
3744    'cdgu'.
3745  */
3746  /* add a fail transition */
3747     const U32 trie_offset = ARG(source);
3748     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3749     U32 *q;
3750     const U32 ucharcount = trie->uniquecharcount;
3751     const U32 numstates = trie->statecount;
3752     const U32 ubound = trie->lasttrans + ucharcount;
3753     U32 q_read = 0;
3754     U32 q_write = 0;
3755     U32 charid;
3756     U32 base = trie->states[ 1 ].trans.base;
3757     U32 *fail;
3758     reg_ac_data *aho;
3759     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3760     regnode *stclass;
3761     GET_RE_DEBUG_FLAGS_DECL;
3762
3763     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3764     PERL_UNUSED_CONTEXT;
3765 #ifndef DEBUGGING
3766     PERL_UNUSED_ARG(depth);
3767 #endif
3768
3769     if ( OP(source) == TRIE ) {
3770         struct regnode_1 *op = (struct regnode_1 *)
3771             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3772         StructCopy(source, op, struct regnode_1);
3773         stclass = (regnode *)op;
3774     } else {
3775         struct regnode_charclass *op = (struct regnode_charclass *)
3776             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3777         StructCopy(source, op, struct regnode_charclass);
3778         stclass = (regnode *)op;
3779     }
3780     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3781
3782     ARG_SET( stclass, data_slot );
3783     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3784     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3785     aho->trie=trie_offset;
3786     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3787     Copy( trie->states, aho->states, numstates, reg_trie_state );
3788     Newx( q, numstates, U32);
3789     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3790     aho->refcount = 1;
3791     fail = aho->fail;
3792     /* initialize fail[0..1] to be 1 so that we always have
3793        a valid final fail state */
3794     fail[ 0 ] = fail[ 1 ] = 1;
3795
3796     for ( charid = 0; charid < ucharcount ; charid++ ) {
3797         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3798         if ( newstate ) {
3799             q[ q_write ] = newstate;
3800             /* set to point at the root */
3801             fail[ q[ q_write++ ] ]=1;
3802         }
3803     }
3804     while ( q_read < q_write) {
3805         const U32 cur = q[ q_read++ % numstates ];
3806         base = trie->states[ cur ].trans.base;
3807
3808         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3809             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3810             if (ch_state) {
3811                 U32 fail_state = cur;
3812                 U32 fail_base;
3813                 do {
3814                     fail_state = fail[ fail_state ];
3815                     fail_base = aho->states[ fail_state ].trans.base;
3816                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3817
3818                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3819                 fail[ ch_state ] = fail_state;
3820                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3821                 {
3822                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3823                 }
3824                 q[ q_write++ % numstates] = ch_state;
3825             }
3826         }
3827     }
3828     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3829        when we fail in state 1, this allows us to use the
3830        charclass scan to find a valid start char. This is based on the principle
3831        that theres a good chance the string being searched contains lots of stuff
3832        that cant be a start char.
3833      */
3834     fail[ 0 ] = fail[ 1 ] = 0;
3835     DEBUG_TRIE_COMPILE_r({
3836         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3837                       depth, (UV)numstates
3838         );
3839         for( q_read=1; q_read<numstates; q_read++ ) {
3840             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3841         }
3842         Perl_re_printf( aTHX_  "\n");
3843     });
3844     Safefree(q);
3845     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3846     return stclass;
3847 }
3848
3849
3850 /* The below joins as many adjacent EXACTish nodes as possible into a single
3851  * one.  The regop may be changed if the node(s) contain certain sequences that
3852  * require special handling.  The joining is only done if:
3853  * 1) there is room in the current conglomerated node to entirely contain the
3854  *    next one.
3855  * 2) they are compatible node types
3856  *
3857  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3858  * these get optimized out
3859  *
3860  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3861  * as possible, even if that means splitting an existing node so that its first
3862  * part is moved to the preceeding node.  This would maximise the efficiency of
3863  * memEQ during matching.
3864  *
3865  * If a node is to match under /i (folded), the number of characters it matches
3866  * can be different than its character length if it contains a multi-character
3867  * fold.  *min_subtract is set to the total delta number of characters of the
3868  * input nodes.
3869  *
3870  * And *unfolded_multi_char is set to indicate whether or not the node contains
3871  * an unfolded multi-char fold.  This happens when it won't be known until
3872  * runtime whether the fold is valid or not; namely
3873  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3874  *      target string being matched against turns out to be UTF-8 is that fold
3875  *      valid; or
3876  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3877  *      runtime.
3878  * (Multi-char folds whose components are all above the Latin1 range are not
3879  * run-time locale dependent, and have already been folded by the time this
3880  * function is called.)
3881  *
3882  * This is as good a place as any to discuss the design of handling these
3883  * multi-character fold sequences.  It's been wrong in Perl for a very long
3884  * time.  There are three code points in Unicode whose multi-character folds
3885  * were long ago discovered to mess things up.  The previous designs for
3886  * dealing with these involved assigning a special node for them.  This
3887  * approach doesn't always work, as evidenced by this example:
3888  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3889  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3890  * would match just the \xDF, it won't be able to handle the case where a
3891  * successful match would have to cross the node's boundary.  The new approach
3892  * that hopefully generally solves the problem generates an EXACTFUP node
3893  * that is "sss" in this case.
3894  *
3895  * It turns out that there are problems with all multi-character folds, and not
3896  * just these three.  Now the code is general, for all such cases.  The
3897  * approach taken is:
3898  * 1)   This routine examines each EXACTFish node that could contain multi-
3899  *      character folded sequences.  Since a single character can fold into
3900  *      such a sequence, the minimum match length for this node is less than
3901  *      the number of characters in the node.  This routine returns in
3902  *      *min_subtract how many characters to subtract from the the actual
3903  *      length of the string to get a real minimum match length; it is 0 if
3904  *      there are no multi-char foldeds.  This delta is used by the caller to
3905  *      adjust the min length of the match, and the delta between min and max,
3906  *      so that the optimizer doesn't reject these possibilities based on size
3907  *      constraints.
3908  *
3909  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3910  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3911  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3912  *      EXACTFU nodes.  The node type of such nodes is then changed to
3913  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3914  *      (The procedures in step 1) above are sufficient to handle this case in
3915  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3916  *      the only case where there is a possible fold length change in non-UTF-8
3917  *      patterns.  By reserving a special node type for problematic cases, the
3918  *      far more common regular EXACTFU nodes can be processed faster.
3919  *      regexec.c takes advantage of this.
3920  *
3921  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3922  *      problematic cases.   These all only occur when the pattern is not
3923  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3924  *      length change, it handles the situation where the string cannot be
3925  *      entirely folded.  The strings in an EXACTFish node are folded as much
3926  *      as possible during compilation in regcomp.c.  This saves effort in
3927  *      regex matching.  By using an EXACTFUP node when it is not possible to
3928  *      fully fold at compile time, regexec.c can know that everything in an
3929  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3930  *      case where folding in EXACTFU nodes can't be done at compile time is
3931  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3932  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3933  *      handle two very different cases.  Alternatively, there could have been
3934  *      a node type where there are length changes, one for unfolded, and one
3935  *      for both.  If yet another special case needed to be created, the number
3936  *      of required node types would have to go to 7.  khw figures that even
3937  *      though there are plenty of node types to spare, that the maintenance
3938  *      cost wasn't worth the small speedup of doing it that way, especially
3939  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3940  *
3941  *      There are other cases where folding isn't done at compile time, but
3942  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3943  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3944  *      changes.  Some folds in EXACTF depend on if the runtime target string
3945  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3946  *      when no fold in it depends on the UTF-8ness of the target string.)
3947  *
3948  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3949  *      validity of the fold won't be known until runtime, and so must remain
3950  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3951  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3952  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3953  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3954  *      The reason this is a problem is that the optimizer part of regexec.c
3955  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3956  *      that a character in the pattern corresponds to at most a single
3957  *      character in the target string.  (And I do mean character, and not byte
3958  *      here, unlike other parts of the documentation that have never been
3959  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3960  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3961  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3962  *      EXACTFL nodes, violate the assumption, and they are the only instances
3963  *      where it is violated.  I'm reluctant to try to change the assumption,
3964  *      as the code involved is impenetrable to me (khw), so instead the code
3965  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3966  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3967  *      boolean indicating whether or not the node contains such a fold.  When
3968  *      it is true, the caller sets a flag that later causes the optimizer in
3969  *      this file to not set values for the floating and fixed string lengths,
3970  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3971  *      assumption.  Thus, there is no optimization based on string lengths for
3972  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3973  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3974  *      assumption is wrong only in these cases is that all other non-UTF-8
3975  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3976  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3977  *      EXACTF nodes because we don't know at compile time if it actually
3978  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3979  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3980  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3981  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3982  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3983  *      string would require the pattern to be forced into UTF-8, the overhead
3984  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3985  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3986  *      locale.)
3987  *
3988  *      Similarly, the code that generates tries doesn't currently handle
3989  *      not-already-folded multi-char folds, and it looks like a pain to change
3990  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
3991  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
3992  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
3993  *      using /iaa matching will be doing so almost entirely with ASCII
3994  *      strings, so this should rarely be encountered in practice */
3995
3996 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3997     if (PL_regkind[OP(scan)] == EXACT) \
3998         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3999
4000 STATIC U32
4001 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4002                    UV *min_subtract, bool *unfolded_multi_char,
4003                    U32 flags, regnode *val, U32 depth)
4004 {
4005     /* Merge several consecutive EXACTish nodes into one. */
4006
4007     regnode *n = regnext(scan);
4008     U32 stringok = 1;
4009     regnode *next = scan + NODE_SZ_STR(scan);
4010     U32 merged = 0;
4011     U32 stopnow = 0;
4012 #ifdef DEBUGGING
4013     regnode *stop = scan;
4014     GET_RE_DEBUG_FLAGS_DECL;
4015 #else
4016     PERL_UNUSED_ARG(depth);
4017 #endif
4018
4019     PERL_ARGS_ASSERT_JOIN_EXACT;
4020 #ifndef EXPERIMENTAL_INPLACESCAN
4021     PERL_UNUSED_ARG(flags);
4022     PERL_UNUSED_ARG(val);
4023 #endif
4024     DEBUG_PEEP("join", scan, depth, 0);
4025
4026     assert(PL_regkind[OP(scan)] == EXACT);
4027
4028     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4029      * EXACT ones that are mergeable to the current one. */
4030     while (    n
4031            && (    PL_regkind[OP(n)] == NOTHING
4032                || (stringok && PL_regkind[OP(n)] == EXACT))
4033            && NEXT_OFF(n)
4034            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4035     {
4036
4037         if (OP(n) == TAIL || n > next)
4038             stringok = 0;
4039         if (PL_regkind[OP(n)] == NOTHING) {
4040             DEBUG_PEEP("skip:", n, depth, 0);
4041             NEXT_OFF(scan) += NEXT_OFF(n);
4042             next = n + NODE_STEP_REGNODE;
4043 #ifdef DEBUGGING
4044             if (stringok)
4045                 stop = n;
4046 #endif
4047             n = regnext(n);
4048         }
4049         else if (stringok) {
4050             const unsigned int oldl = STR_LEN(scan);
4051             regnode * const nnext = regnext(n);
4052
4053             /* XXX I (khw) kind of doubt that this works on platforms (should
4054              * Perl ever run on one) where U8_MAX is above 255 because of lots
4055              * of other assumptions */
4056             /* Don't join if the sum can't fit into a single node */
4057             if (oldl + STR_LEN(n) > U8_MAX)
4058                 break;
4059
4060             /* Joining something that requires UTF-8 with something that
4061              * doesn't, means the result requires UTF-8. */
4062             if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
4063                 OP(scan) = EXACT_ONLY8;
4064             }
4065             else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
4066                 ;   /* join is compatible, no need to change OP */
4067             }
4068             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
4069                 OP(scan) = EXACTFU_ONLY8;
4070             }
4071             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
4072                 ;   /* join is compatible, no need to change OP */
4073             }
4074             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4075                 ;   /* join is compatible, no need to change OP */
4076             }
4077             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4078
4079                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4080                   * which can join with EXACTFU ones.  We check for this case
4081                   * here.  These need to be resolved to either EXACTFU or
4082                   * EXACTF at joining time.  They have nothing in them that
4083                   * would forbid them from being the more desirable EXACTFU
4084                   * nodes except that they begin and/or end with a single [Ss].
4085                   * The reason this is problematic is because they could be
4086                   * joined in this loop with an adjacent node that ends and/or
4087                   * begins with [Ss] which would then form the sequence 'ss',
4088                   * which matches differently under /di than /ui, in which case
4089                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4090                   * formed, the nodes get absorbed into any adjacent EXACTFU
4091                   * node.  And if the only adjacent node is EXACTF, they get
4092                   * absorbed into that, under the theory that a longer node is
4093                   * better than two shorter ones, even if one is EXACTFU.  Note
4094                   * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
4095                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4096
4097                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4098
4099                     /* Here the joined node would end with 's'.  If the node
4100                      * following the combination is an EXACTF one, it's better to
4101                      * join this trailing edge 's' node with that one, leaving the
4102                      * current one in 'scan' be the more desirable EXACTFU */
4103                     if (OP(nnext) == EXACTF) {
4104                         break;
4105                     }
4106
4107                     OP(scan) = EXACTFU_S_EDGE;
4108
4109                 }   /* Otherwise, the beginning 's' of the 2nd node just
4110                        becomes an interior 's' in 'scan' */
4111             }
4112             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4113                 ;   /* join is compatible, no need to change OP */
4114             }
4115             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4116
4117                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4118                  * nodes.  But the latter nodes can be also joined with EXACTFU
4119                  * ones, and that is a better outcome, so if the node following
4120                  * 'n' is EXACTFU, quit now so that those two can be joined
4121                  * later */
4122                 if (OP(nnext) == EXACTFU) {
4123                     break;
4124                 }
4125
4126                 /* The join is compatible, and the combined node will be
4127                  * EXACTF.  (These don't care if they begin or end with 's' */
4128             }
4129             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4130                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4131                     && STRING(n)[0] == 's')
4132                 {
4133                     /* When combined, we have the sequence 'ss', which means we
4134                      * have to remain /di */
4135                     OP(scan) = EXACTF;
4136                 }
4137             }
4138             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4139                 if (STRING(n)[0] == 's') {
4140                     ;   /* Here the join is compatible and the combined node
4141                            starts with 's', no need to change OP */
4142                 }
4143                 else {  /* Now the trailing 's' is in the interior */
4144                     OP(scan) = EXACTFU;
4145                 }
4146             }
4147             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4148
4149                 /* The join is compatible, and the combined node will be
4150                  * EXACTF.  (These don't care if they begin or end with 's' */
4151                 OP(scan) = EXACTF;
4152             }
4153             else if (OP(scan) != OP(n)) {
4154
4155                 /* The only other compatible joinings are the same node type */
4156                 break;
4157             }
4158
4159             DEBUG_PEEP("merg", n, depth, 0);
4160             merged++;
4161
4162             NEXT_OFF(scan) += NEXT_OFF(n);
4163             STR_LEN(scan) += STR_LEN(n);
4164             next = n + NODE_SZ_STR(n);
4165             /* Now we can overwrite *n : */
4166             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4167 #ifdef DEBUGGING
4168             stop = next - 1;
4169 #endif
4170             n = nnext;
4171             if (stopnow) break;
4172         }
4173
4174 #ifdef EXPERIMENTAL_INPLACESCAN
4175         if (flags && !NEXT_OFF(n)) {
4176             DEBUG_PEEP("atch", val, depth, 0);
4177             if (reg_off_by_arg[OP(n)]) {
4178                 ARG_SET(n, val - n);
4179             }
4180             else {
4181                 NEXT_OFF(n) = val - n;
4182             }
4183             stopnow = 1;
4184         }
4185 #endif
4186     }
4187
4188     /* This temporary node can now be turned into EXACTFU, and must, as
4189      * regexec.c doesn't handle it */
4190     if (OP(scan) == EXACTFU_S_EDGE) {
4191         OP(scan) = EXACTFU;
4192     }
4193
4194     *min_subtract = 0;
4195     *unfolded_multi_char = FALSE;
4196
4197     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4198      * can now analyze for sequences of problematic code points.  (Prior to
4199      * this final joining, sequences could have been split over boundaries, and
4200      * hence missed).  The sequences only happen in folding, hence for any
4201      * non-EXACT EXACTish node */
4202     if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4203         U8* s0 = (U8*) STRING(scan);
4204         U8* s = s0;
4205         U8* s_end = s0 + STR_LEN(scan);
4206
4207         int total_count_delta = 0;  /* Total delta number of characters that
4208                                        multi-char folds expand to */
4209
4210         /* One pass is made over the node's string looking for all the
4211          * possibilities.  To avoid some tests in the loop, there are two main
4212          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4213          * non-UTF-8 */
4214         if (UTF) {
4215             U8* folded = NULL;
4216
4217             if (OP(scan) == EXACTFL) {
4218                 U8 *d;
4219
4220                 /* An EXACTFL node would already have been changed to another
4221                  * node type unless there is at least one character in it that
4222                  * is problematic; likely a character whose fold definition
4223                  * won't be known until runtime, and so has yet to be folded.
4224                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4225                  * to handle the UTF-8 case, we need to create a temporary
4226                  * folded copy using UTF-8 locale rules in order to analyze it.
4227                  * This is because our macros that look to see if a sequence is
4228                  * a multi-char fold assume everything is folded (otherwise the
4229                  * tests in those macros would be too complicated and slow).
4230                  * Note that here, the non-problematic folds will have already
4231                  * been done, so we can just copy such characters.  We actually
4232                  * don't completely fold the EXACTFL string.  We skip the
4233                  * unfolded multi-char folds, as that would just create work
4234                  * below to figure out the size they already are */
4235
4236                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4237                 d = folded;
4238                 while (s < s_end) {
4239                     STRLEN s_len = UTF8SKIP(s);
4240                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4241                         Copy(s, d, s_len, U8);
4242                         d += s_len;
4243                     }
4244                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4245                         *unfolded_multi_char = TRUE;
4246                         Copy(s, d, s_len, U8);
4247                         d += s_len;
4248                     }
4249                     else if (isASCII(*s)) {
4250                         *(d++) = toFOLD(*s);
4251                     }
4252                     else {
4253                         STRLEN len;
4254                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4255                         d += len;
4256                     }
4257                     s += s_len;
4258                 }
4259
4260                 /* Point the remainder of the routine to look at our temporary
4261                  * folded copy */
4262                 s = folded;
4263                 s_end = d;
4264             } /* End of creating folded copy of EXACTFL string */
4265
4266             /* Examine the string for a multi-character fold sequence.  UTF-8
4267              * patterns have all characters pre-folded by the time this code is
4268              * executed */
4269             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4270                                      length sequence we are looking for is 2 */
4271             {
4272                 int count = 0;  /* How many characters in a multi-char fold */
4273                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4274                 if (! len) {    /* Not a multi-char fold: get next char */
4275                     s += UTF8SKIP(s);
4276                     continue;
4277                 }
4278
4279                 { /* Here is a generic multi-char fold. */
4280                     U8* multi_end  = s + len;
4281
4282                     /* Count how many characters are in it.  In the case of
4283                      * /aa, no folds which contain ASCII code points are
4284                      * allowed, so check for those, and skip if found. */
4285                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4286                         count = utf8_length(s, multi_end);
4287                         s = multi_end;
4288                     }
4289                     else {
4290                         while (s < multi_end) {
4291                             if (isASCII(*s)) {
4292                                 s++;
4293                                 goto next_iteration;
4294                             }
4295                             else {
4296                                 s += UTF8SKIP(s);
4297                             }
4298                             count++;
4299                         }
4300                     }
4301                 }
4302
4303                 /* The delta is how long the sequence is minus 1 (1 is how long
4304                  * the character that folds to the sequence is) */
4305                 total_count_delta += count - 1;
4306               next_iteration: ;
4307             }
4308
4309             /* We created a temporary folded copy of the string in EXACTFL
4310              * nodes.  Therefore we need to be sure it doesn't go below zero,
4311              * as the real string could be shorter */
4312             if (OP(scan) == EXACTFL) {
4313                 int total_chars = utf8_length((U8*) STRING(scan),
4314                                            (U8*) STRING(scan) + STR_LEN(scan));
4315                 if (total_count_delta > total_chars) {
4316                     total_count_delta = total_chars;
4317                 }
4318             }
4319
4320             *min_subtract += total_count_delta;
4321             Safefree(folded);
4322         }
4323         else if (OP(scan) == EXACTFAA) {
4324
4325             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4326              * fold to the ASCII range (and there are no existing ones in the
4327              * upper latin1 range).  But, as outlined in the comments preceding
4328              * this function, we need to flag any occurrences of the sharp s.
4329              * This character forbids trie formation (because of added
4330              * complexity) */
4331 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4332    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4333                                       || UNICODE_DOT_DOT_VERSION > 0)
4334             while (s < s_end) {
4335                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4336                     OP(scan) = EXACTFAA_NO_TRIE;
4337                     *unfolded_multi_char = TRUE;
4338                     break;
4339                 }
4340                 s++;
4341             }
4342         }
4343         else {
4344
4345             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4346              * folds that are all Latin1.  As explained in the comments
4347              * preceding this function, we look also for the sharp s in EXACTF
4348              * and EXACTFL nodes; it can be in the final position.  Otherwise
4349              * we can stop looking 1 byte earlier because have to find at least
4350              * two characters for a multi-fold */
4351             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4352                               ? s_end
4353                               : s_end -1;
4354
4355             while (s < upper) {
4356                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4357                 if (! len) {    /* Not a multi-char fold. */
4358                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4359                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4360                     {
4361                         *unfolded_multi_char = TRUE;
4362                     }
4363                     s++;
4364                     continue;
4365                 }
4366
4367                 if (len == 2
4368                     && isALPHA_FOLD_EQ(*s, 's')
4369                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4370                 {
4371
4372                     /* EXACTF nodes need to know that the minimum length
4373                      * changed so that a sharp s in the string can match this
4374                      * ss in the pattern, but they remain EXACTF nodes, as they
4375                      * won't match this unless the target string is is UTF-8,
4376                      * which we don't know until runtime.  EXACTFL nodes can't
4377                      * transform into EXACTFU nodes */
4378                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4379                         OP(scan) = EXACTFUP;
4380                     }
4381                 }
4382
4383                 *min_subtract += len - 1;
4384                 s += len;
4385             }
4386 #endif
4387         }
4388
4389         if (     STR_LEN(scan) == 1
4390             &&   isALPHA_A(* STRING(scan))
4391             &&  (         OP(scan) == EXACTFAA
4392                  || (     OP(scan) == EXACTFU
4393                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4394         {
4395             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4396
4397             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4398              * with the mask set to the complement of the bit that differs
4399              * between upper and lower case, and the lowest code point of the
4400              * pair (which the '&' forces) */
4401             OP(scan) = ANYOFM;
4402             ARG_SET(scan, *STRING(scan) & mask);
4403             FLAGS(scan) = mask;
4404         }
4405     }
4406
4407 #ifdef DEBUGGING
4408     /* Allow dumping but overwriting the collection of skipped
4409      * ops and/or strings with fake optimized ops */
4410     n = scan + NODE_SZ_STR(scan);
4411     while (n <= stop) {
4412         OP(n) = OPTIMIZED;
4413         FLAGS(n) = 0;
4414         NEXT_OFF(n) = 0;
4415         n++;
4416     }
4417 #endif
4418     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4419     return stopnow;
4420 }
4421
4422 /* REx optimizer.  Converts nodes into quicker variants "in place".
4423    Finds fixed substrings.  */
4424
4425 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4426    to the position after last scanned or to NULL. */
4427
4428 #define INIT_AND_WITHP \
4429     assert(!and_withp); \
4430     Newx(and_withp, 1, regnode_ssc); \
4431     SAVEFREEPV(and_withp)
4432
4433
4434 static void
4435 S_unwind_scan_frames(pTHX_ const void *p)
4436 {
4437     scan_frame *f= (scan_frame *)p;
4438     do {
4439         scan_frame *n= f->next_frame;
4440         Safefree(f);
4441         f= n;
4442     } while (f);
4443 }
4444
4445 /* the return from this sub is the minimum length that could possibly match */
4446 STATIC SSize_t
4447 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4448                         SSize_t *minlenp, SSize_t *deltap,
4449                         regnode *last,
4450                         scan_data_t *data,
4451                         I32 stopparen,
4452                         U32 recursed_depth,
4453                         regnode_ssc *and_withp,
4454                         U32 flags, U32 depth)
4455                         /* scanp: Start here (read-write). */
4456                         /* deltap: Write maxlen-minlen here. */
4457                         /* last: Stop before this one. */
4458                         /* data: string data about the pattern */
4459                         /* stopparen: treat close N as END */
4460                         /* recursed: which subroutines have we recursed into */
4461                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4462 {
4463     dVAR;
4464     /* There must be at least this number of characters to match */
4465     SSize_t min = 0;
4466     I32 pars = 0, code;
4467     regnode *scan = *scanp, *next;
4468     SSize_t delta = 0;
4469     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4470     int is_inf_internal = 0;            /* The studied chunk is infinite */
4471     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4472     scan_data_t data_fake;
4473     SV *re_trie_maxbuff = NULL;
4474     regnode *first_non_open = scan;
4475     SSize_t stopmin = SSize_t_MAX;
4476     scan_frame *frame = NULL;
4477     GET_RE_DEBUG_FLAGS_DECL;
4478
4479     PERL_ARGS_ASSERT_STUDY_CHUNK;
4480     RExC_study_started= 1;
4481
4482     Zero(&data_fake, 1, scan_data_t);
4483
4484     if ( depth == 0 ) {
4485         while (first_non_open && OP(first_non_open) == OPEN)
4486             first_non_open=regnext(first_non_open);
4487     }
4488
4489
4490   fake_study_recurse:
4491     DEBUG_r(
4492         RExC_study_chunk_recursed_count++;
4493     );
4494     DEBUG_OPTIMISE_MORE_r(
4495     {
4496         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4497             depth, (long)stopparen,
4498             (unsigned long)RExC_study_chunk_recursed_count,
4499             (unsigned long)depth, (unsigned long)recursed_depth,
4500             scan,
4501             last);
4502         if (recursed_depth) {
4503             U32 i;
4504             U32 j;
4505             for ( j = 0 ; j < recursed_depth ; j++ ) {
4506                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4507                     if (
4508                         PAREN_TEST(RExC_study_chunk_recursed +
4509                                    ( j * RExC_study_chunk_recursed_bytes), i )
4510                         && (
4511                             !j ||
4512                             !PAREN_TEST(RExC_study_chunk_recursed +
4513                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4514                         )
4515                     ) {
4516                         Perl_re_printf( aTHX_ " %d",(int)i);
4517                         break;
4518                     }
4519                 }
4520                 if ( j + 1 < recursed_depth ) {
4521                     Perl_re_printf( aTHX_  ",");
4522                 }
4523             }
4524         }
4525         Perl_re_printf( aTHX_ "\n");
4526     }
4527     );
4528     while ( scan && OP(scan) != END && scan < last ){
4529         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4530                                    node length to get a real minimum (because
4531                                    the folded version may be shorter) */
4532         bool unfolded_multi_char = FALSE;
4533         /* Peephole optimizer: */
4534         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4535         DEBUG_PEEP("Peep", scan, depth, flags);
4536
4537
4538         /* The reason we do this here is that we need to deal with things like
4539          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4540          * parsing code, as each (?:..) is handled by a different invocation of
4541          * reg() -- Yves
4542          */
4543         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4544
4545         /* Follow the next-chain of the current node and optimize
4546            away all the NOTHINGs from it.  */
4547         if (OP(scan) != CURLYX) {
4548             const int max = (reg_off_by_arg[OP(scan)]
4549                        ? I32_MAX
4550                        /* I32 may be smaller than U16 on CRAYs! */
4551                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4552             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4553             int noff;
4554             regnode *n = scan;
4555
4556             /* Skip NOTHING and LONGJMP. */
4557             while ((n = regnext(n))
4558                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4559                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4560                    && off + noff < max)
4561                 off += noff;
4562             if (reg_off_by_arg[OP(scan)])
4563                 ARG(scan) = off;
4564             else
4565                 NEXT_OFF(scan) = off;
4566         }
4567
4568         /* The principal pseudo-switch.  Cannot be a switch, since we
4569            look into several different things.  */
4570         if ( OP(scan) == DEFINEP ) {
4571             SSize_t minlen = 0;
4572             SSize_t deltanext = 0;
4573             SSize_t fake_last_close = 0;
4574             I32 f = SCF_IN_DEFINE;
4575
4576             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4577             scan = regnext(scan);
4578             assert( OP(scan) == IFTHEN );
4579             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4580
4581             data_fake.last_closep= &fake_last_close;
4582             minlen = *minlenp;
4583             next = regnext(scan);
4584             scan = NEXTOPER(NEXTOPER(scan));
4585             DEBUG_PEEP("scan", scan, depth, flags);
4586             DEBUG_PEEP("next", next, depth, flags);
4587
4588             /* we suppose the run is continuous, last=next...
4589              * NOTE we dont use the return here! */
4590             /* DEFINEP study_chunk() recursion */
4591             (void)study_chunk(pRExC_state, &scan, &minlen,
4592                               &deltanext, next, &data_fake, stopparen,
4593                               recursed_depth, NULL, f, depth+1);
4594
4595             scan = next;
4596         } else
4597         if (
4598             OP(scan) == BRANCH  ||
4599             OP(scan) == BRANCHJ ||
4600             OP(scan) == IFTHEN
4601         ) {
4602             next = regnext(scan);
4603             code = OP(scan);
4604
4605             /* The op(next)==code check below is to see if we
4606              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4607              * IFTHEN is special as it might not appear in pairs.
4608              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4609              * we dont handle it cleanly. */
4610             if (OP(next) == code || code == IFTHEN) {
4611                 /* NOTE - There is similar code to this block below for
4612                  * handling TRIE nodes on a re-study.  If you change stuff here
4613                  * check there too. */
4614                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4615                 regnode_ssc accum;
4616                 regnode * const startbranch=scan;
4617
4618                 if (flags & SCF_DO_SUBSTR) {
4619                     /* Cannot merge strings after this. */
4620                     scan_commit(pRExC_state, data, minlenp, is_inf);
4621                 }
4622
4623                 if (flags & SCF_DO_STCLASS)
4624                     ssc_init_zero(pRExC_state, &accum);
4625
4626                 while (OP(scan) == code) {
4627                     SSize_t deltanext, minnext, fake;
4628                     I32 f = 0;
4629                     regnode_ssc this_class;
4630
4631                     DEBUG_PEEP("Branch", scan, depth, flags);
4632
4633                     num++;
4634                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4635                     if (data) {
4636                         data_fake.whilem_c = data->whilem_c;
4637                         data_fake.last_closep = data->last_closep;
4638                     }
4639                     else
4640                         data_fake.last_closep = &fake;
4641
4642                     data_fake.pos_delta = delta;
4643                     next = regnext(scan);
4644
4645                     scan = NEXTOPER(scan); /* everything */
4646                     if (code != BRANCH)    /* everything but BRANCH */
4647                         scan = NEXTOPER(scan);
4648
4649                     if (flags & SCF_DO_STCLASS) {
4650                         ssc_init(pRExC_state, &this_class);
4651                         data_fake.start_class = &this_class;
4652                         f = SCF_DO_STCLASS_AND;
4653                     }
4654                     if (flags & SCF_WHILEM_VISITED_POS)
4655                         f |= SCF_WHILEM_VISITED_POS;
4656
4657                     /* we suppose the run is continuous, last=next...*/
4658                     /* recurse study_chunk() for each BRANCH in an alternation */
4659                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4660                                       &deltanext, next, &data_fake, stopparen,
4661                                       recursed_depth, NULL, f, depth+1);
4662
4663                     if (min1 > minnext)
4664                         min1 = minnext;
4665                     if (deltanext == SSize_t_MAX) {
4666                         is_inf = is_inf_internal = 1;
4667                         max1 = SSize_t_MAX;
4668                     } else if (max1 < minnext + deltanext)
4669                         max1 = minnext + deltanext;
4670                     scan = next;
4671                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4672                         pars++;
4673                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4674                         if ( stopmin > minnext)
4675                             stopmin = min + min1;
4676                         flags &= ~SCF_DO_SUBSTR;
4677                         if (data)
4678                             data->flags |= SCF_SEEN_ACCEPT;
4679                     }
4680                     if (data) {
4681                         if (data_fake.flags & SF_HAS_EVAL)
4682                             data->flags |= SF_HAS_EVAL;
4683                         data->whilem_c = data_fake.whilem_c;
4684                     }
4685                     if (flags & SCF_DO_STCLASS)
4686                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4687                 }
4688                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4689                     min1 = 0;
4690                 if (flags & SCF_DO_SUBSTR) {
4691                     data->pos_min += min1;
4692                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4693                         data->pos_delta = SSize_t_MAX;
4694                     else
4695                         data->pos_delta += max1 - min1;
4696                     if (max1 != min1 || is_inf)
4697                         data->cur_is_floating = 1;
4698                 }
4699                 min += min1;
4700                 if (delta == SSize_t_MAX
4701                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4702                     delta = SSize_t_MAX;
4703                 else
4704                     delta += max1 - min1;
4705                 if (flags & SCF_DO_STCLASS_OR) {
4706                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4707                     if (min1) {
4708                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4709                         flags &= ~SCF_DO_STCLASS;
4710                     }
4711                 }
4712                 else if (flags & SCF_DO_STCLASS_AND) {
4713                     if (min1) {
4714                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4715                         flags &= ~SCF_DO_STCLASS;
4716                     }
4717                     else {
4718                         /* Switch to OR mode: cache the old value of
4719                          * data->start_class */
4720                         INIT_AND_WITHP;
4721                         StructCopy(data->start_class, and_withp, regnode_ssc);
4722                         flags &= ~SCF_DO_STCLASS_AND;
4723                         StructCopy(&accum, data->start_class, regnode_ssc);
4724                         flags |= SCF_DO_STCLASS_OR;
4725                     }
4726                 }
4727
4728                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4729                         OP( startbranch ) == BRANCH )
4730                 {
4731                 /* demq.
4732
4733                    Assuming this was/is a branch we are dealing with: 'scan'
4734                    now points at the item that follows the branch sequence,
4735                    whatever it is. We now start at the beginning of the
4736                    sequence and look for subsequences of
4737
4738                    BRANCH->EXACT=>x1
4739                    BRANCH->EXACT=>x2
4740                    tail
4741
4742                    which would be constructed from a pattern like
4743                    /A|LIST|OF|WORDS/
4744
4745                    If we can find such a subsequence we need to turn the first
4746                    element into a trie and then add the subsequent branch exact
4747                    strings to the trie.
4748
4749                    We have two cases
4750
4751                      1. patterns where the whole set of branches can be
4752                         converted.
4753
4754                      2. patterns where only a subset can be converted.
4755
4756                    In case 1 we can replace the whole set with a single regop
4757                    for the trie. In case 2 we need to keep the start and end
4758                    branches so
4759
4760                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4761                      becomes BRANCH TRIE; BRANCH X;
4762
4763                   There is an additional case, that being where there is a
4764                   common prefix, which gets split out into an EXACT like node
4765                   preceding the TRIE node.
4766
4767                   If x(1..n)==tail then we can do a simple trie, if not we make
4768                   a "jump" trie, such that when we match the appropriate word
4769                   we "jump" to the appropriate tail node. Essentially we turn
4770                   a nested if into a case structure of sorts.
4771
4772                 */
4773
4774                     int made=0;
4775                     if (!re_trie_maxbuff) {
4776                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4777                         if (!SvIOK(re_trie_maxbuff))
4778                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4779                     }
4780                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4781                         regnode *cur;
4782                         regnode *first = (regnode *)NULL;
4783                         regnode *last = (regnode *)NULL;
4784                         regnode *tail = scan;
4785                         U8 trietype = 0;
4786                         U32 count=0;
4787
4788                         /* var tail is used because there may be a TAIL
4789                            regop in the way. Ie, the exacts will point to the
4790                            thing following the TAIL, but the last branch will
4791                            point at the TAIL. So we advance tail. If we
4792                            have nested (?:) we may have to move through several
4793                            tails.
4794                          */
4795
4796                         while ( OP( tail ) == TAIL ) {
4797                             /* this is the TAIL generated by (?:) */
4798                             tail = regnext( tail );
4799                         }
4800
4801
4802                         DEBUG_TRIE_COMPILE_r({
4803                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4804                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4805                               depth+1,
4806                               "Looking for TRIE'able sequences. Tail node is ",
4807                               (UV) REGNODE_OFFSET(tail),
4808                               SvPV_nolen_const( RExC_mysv )
4809                             );
4810                         });
4811
4812                         /*
4813
4814                             Step through the branches
4815                                 cur represents each branch,
4816                                 noper is the first thing to be matched as part
4817                                       of that branch
4818                                 noper_next is the regnext() of that node.
4819
4820                             We normally handle a case like this
4821                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4822                             support building with NOJUMPTRIE, which restricts
4823                             the trie logic to structures like /FOO|BAR/.
4824
4825                             If noper is a trieable nodetype then the branch is
4826                             a possible optimization target. If we are building
4827                             under NOJUMPTRIE then we require that noper_next is
4828                             the same as scan (our current position in the regex
4829                             program).
4830
4831                             Once we have two or more consecutive such branches
4832                             we can create a trie of the EXACT's contents and
4833                             stitch it in place into the program.
4834
4835                             If the sequence represents all of the branches in
4836                             the alternation we replace the entire thing with a
4837                             single TRIE node.
4838
4839                             Otherwise when it is a subsequence we need to
4840                             stitch it in place and replace only the relevant
4841                             branches. This means the first branch has to remain
4842                             as it is used by the alternation logic, and its
4843                             next pointer, and needs to be repointed at the item
4844                             on the branch chain following the last branch we
4845                             have optimized away.
4846
4847                             This could be either a BRANCH, in which case the
4848                             subsequence is internal, or it could be the item
4849                             following the branch sequence in which case the
4850                             subsequence is at the end (which does not
4851                             necessarily mean the first node is the start of the
4852                             alternation).
4853
4854                             TRIE_TYPE(X) is a define which maps the optype to a
4855                             trietype.
4856
4857                                 optype          |  trietype
4858                                 ----------------+-----------
4859                                 NOTHING         | NOTHING
4860                                 EXACT           | EXACT
4861                                 EXACT_ONLY8     | EXACT
4862                                 EXACTFU         | EXACTFU
4863                                 EXACTFU_ONLY8   | EXACTFU
4864                                 EXACTFUP        | EXACTFU
4865                                 EXACTFAA        | EXACTFAA
4866                                 EXACTL          | EXACTL
4867                                 EXACTFLU8       | EXACTFLU8
4868
4869
4870                         */
4871 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4872                        ? NOTHING                                            \
4873                        : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
4874                          ? EXACT                                            \
4875                          : (     EXACTFU == (X)                             \
4876                               || EXACTFU_ONLY8 == (X)                       \
4877                               || EXACTFUP == (X) )                          \
4878                            ? EXACTFU                                        \
4879                            : ( EXACTFAA == (X) )                            \
4880                              ? EXACTFAA                                     \
4881                              : ( EXACTL == (X) )                            \
4882                                ? EXACTL                                     \
4883                                : ( EXACTFLU8 == (X) )                       \
4884                                  ? EXACTFLU8                                \
4885                                  : 0 )
4886
4887                         /* dont use tail as the end marker for this traverse */
4888                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4889                             regnode * const noper = NEXTOPER( cur );
4890                             U8 noper_type = OP( noper );
4891                             U8 noper_trietype = TRIE_TYPE( noper_type );
4892 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4893                             regnode * const noper_next = regnext( noper );
4894                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4895                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4896 #endif
4897
4898                             DEBUG_TRIE_COMPILE_r({
4899                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4900                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4901                                    depth+1,
4902                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4903
4904                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4905                                 Perl_re_printf( aTHX_  " -> %d:%s",
4906                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4907
4908                                 if ( noper_next ) {
4909                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4910                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4911                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4912                                 }
4913                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4914                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4915                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4916                                 );
4917                             });
4918
4919                             /* Is noper a trieable nodetype that can be merged
4920                              * with the current trie (if there is one)? */
4921                             if ( noper_trietype
4922                                   &&
4923                                   (
4924                                         ( noper_trietype == NOTHING )
4925                                         || ( trietype == NOTHING )
4926                                         || ( trietype == noper_trietype )
4927                                   )
4928 #ifdef NOJUMPTRIE
4929                                   && noper_next >= tail
4930 #endif
4931                                   && count < U16_MAX)
4932                             {
4933                                 /* Handle mergable triable node Either we are
4934                                  * the first node in a new trieable sequence,
4935                                  * in which case we do some bookkeeping,
4936                                  * otherwise we update the end pointer. */
4937                                 if ( !first ) {
4938                                     first = cur;
4939                                     if ( noper_trietype == NOTHING ) {
4940 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4941                                         regnode * const noper_next = regnext( noper );
4942                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4943                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4944 #endif
4945
4946                                         if ( noper_next_trietype ) {
4947                                             trietype = noper_next_trietype;
4948                                         } else if (noper_next_type)  {
4949                                             /* a NOTHING regop is 1 regop wide.
4950                                              * We need at least two for a trie
4951                                              * so we can't merge this in */
4952                                             first = NULL;
4953                                         }
4954                                     } else {
4955                                         trietype = noper_trietype;
4956                                     }
4957                                 } else {
4958                                     if ( trietype == NOTHING )
4959                                         trietype = noper_trietype;
4960                                     last = cur;
4961                                 }
4962                                 if (first)
4963                                     count++;
4964                             } /* end handle mergable triable node */
4965                             else {
4966                                 /* handle unmergable node -
4967                                  * noper may either be a triable node which can
4968                                  * not be tried together with the current trie,
4969                                  * or a non triable node */
4970                                 if ( last ) {
4971                                     /* If last is set and trietype is not
4972                                      * NOTHING then we have found at least two
4973                                      * triable branch sequences in a row of a
4974                                      * similar trietype so we can turn them
4975                                      * into a trie. If/when we allow NOTHING to
4976                                      * start a trie sequence this condition
4977                                      * will be required, and it isn't expensive
4978                                      * so we leave it in for now. */
4979                                     if ( trietype && trietype != NOTHING )
4980                                         make_trie( pRExC_state,
4981                                                 startbranch, first, cur, tail,
4982                                                 count, trietype, depth+1 );
4983                                     last = NULL; /* note: we clear/update
4984                                                     first, trietype etc below,
4985                                                     so we dont do it here */
4986                                 }
4987                                 if ( noper_trietype
4988 #ifdef NOJUMPTRIE
4989                                      && noper_next >= tail
4990 #endif
4991                                 ){
4992                                     /* noper is triable, so we can start a new
4993                                      * trie sequence */
4994                                     count = 1;
4995                                     first = cur;
4996                                     trietype = noper_trietype;
4997                                 } else if (first) {
4998                                     /* if we already saw a first but the
4999                                      * current node is not triable then we have
5000                                      * to reset the first information. */
5001                                     count = 0;
5002                                     first = NULL;
5003                                     trietype = 0;
5004                                 }
5005                             } /* end handle unmergable node */
5006                         } /* loop over branches */
5007                         DEBUG_TRIE_COMPILE_r({
5008                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5009                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5010                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5011                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5012                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
5013                                PL_reg_name[trietype]
5014                             );
5015
5016                         });
5017                         if ( last && trietype ) {
5018                             if ( trietype != NOTHING ) {
5019                                 /* the last branch of the sequence was part of
5020                                  * a trie, so we have to construct it here
5021                                  * outside of the loop */
5022                                 made= make_trie( pRExC_state, startbranch,
5023                                                  first, scan, tail, count,
5024                                                  trietype, depth+1 );
5025 #ifdef TRIE_STUDY_OPT
5026                                 if ( ((made == MADE_EXACT_TRIE &&
5027                                      startbranch == first)
5028                                      || ( first_non_open == first )) &&
5029                                      depth==0 ) {
5030                                     flags |= SCF_TRIE_RESTUDY;
5031                                     if ( startbranch == first
5032                                          && scan >= tail )
5033                                     {
5034                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5035                                     }
5036                                 }
5037 #endif
5038                             } else {
5039                                 /* at this point we know whatever we have is a
5040                                  * NOTHING sequence/branch AND if 'startbranch'
5041                                  * is 'first' then we can turn the whole thing
5042                                  * into a NOTHING
5043                                  */
5044                                 if ( startbranch == first ) {
5045                                     regnode *opt;
5046                                     /* the entire thing is a NOTHING sequence,
5047                                      * something like this: (?:|) So we can
5048                                      * turn it into a plain NOTHING op. */
5049                                     DEBUG_TRIE_COMPILE_r({
5050                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5051                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5052                                           depth+1,
5053                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5054
5055                                     });
5056                                     OP(startbranch)= NOTHING;
5057                                     NEXT_OFF(startbranch)= tail - startbranch;
5058                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5059                                         OP(opt)= OPTIMIZED;
5060                                 }
5061                             }
5062                         } /* end if ( last) */
5063                     } /* TRIE_MAXBUF is non zero */
5064
5065                 } /* do trie */
5066
5067             }
5068             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5069                 scan = NEXTOPER(NEXTOPER(scan));
5070             } else                      /* single branch is optimized. */
5071                 scan = NEXTOPER(scan);
5072             continue;
5073         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5074             I32 paren = 0;
5075             regnode *start = NULL;
5076             regnode *end = NULL;
5077             U32 my_recursed_depth= recursed_depth;
5078
5079             if (OP(scan) != SUSPEND) { /* GOSUB */
5080                 /* Do setup, note this code has side effects beyond
5081                  * the rest of this block. Specifically setting
5082                  * RExC_recurse[] must happen at least once during
5083                  * study_chunk(). */
5084                 paren = ARG(scan);
5085                 RExC_recurse[ARG2L(scan)] = scan;
5086                 start = REGNODE_p(RExC_open_parens[paren]);
5087                 end   = REGNODE_p(RExC_close_parens[paren]);
5088
5089                 /* NOTE we MUST always execute the above code, even
5090                  * if we do nothing with a GOSUB */
5091                 if (
5092                     ( flags & SCF_IN_DEFINE )
5093                     ||
5094                     (
5095                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5096                         &&
5097                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5098                     )
5099                 ) {
5100                     /* no need to do anything here if we are in a define. */
5101                     /* or we are after some kind of infinite construct
5102                      * so we can skip recursing into this item.
5103                      * Since it is infinite we will not change the maxlen
5104                      * or delta, and if we miss something that might raise
5105                      * the minlen it will merely pessimise a little.
5106                      *
5107                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5108                      * might result in a minlen of 1 and not of 4,
5109                      * but this doesn't make us mismatch, just try a bit
5110                      * harder than we should.
5111                      * */
5112                     scan= regnext(scan);
5113                     continue;
5114                 }
5115
5116                 if (
5117                     !recursed_depth
5118                     ||
5119                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5120                 ) {
5121                     /* it is quite possible that there are more efficient ways
5122                      * to do this. We maintain a bitmap per level of recursion
5123                      * of which patterns we have entered so we can detect if a
5124                      * pattern creates a possible infinite loop. When we
5125                      * recurse down a level we copy the previous levels bitmap
5126                      * down. When we are at recursion level 0 we zero the top
5127                      * level bitmap. It would be nice to implement a different
5128                      * more efficient way of doing this. In particular the top
5129                      * level bitmap may be unnecessary.
5130                      */
5131                     if (!recursed_depth) {
5132                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5133                     } else {
5134                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5135                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5136                              RExC_study_chunk_recursed_bytes, U8);
5137                     }
5138                     /* we havent recursed into this paren yet, so recurse into it */
5139                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5140                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5141                     my_recursed_depth= recursed_depth + 1;
5142                 } else {
5143                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5144                     /* some form of infinite recursion, assume infinite length
5145                      * */
5146                     if (flags & SCF_DO_SUBSTR) {
5147                         scan_commit(pRExC_state, data, minlenp, is_inf);
5148                         data->cur_is_floating = 1;
5149                     }
5150                     is_inf = is_inf_internal = 1;
5151                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5152                         ssc_anything(data->start_class);
5153                     flags &= ~SCF_DO_STCLASS;
5154
5155                     start= NULL; /* reset start so we dont recurse later on. */
5156                 }
5157             } else {
5158                 paren = stopparen;
5159                 start = scan + 2;
5160                 end = regnext(scan);
5161             }
5162             if (start) {
5163                 scan_frame *newframe;
5164                 assert(end);
5165                 if (!RExC_frame_last) {
5166                     Newxz(newframe, 1, scan_frame);
5167                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5168                     RExC_frame_head= newframe;
5169                     RExC_frame_count++;
5170                 } else if (!RExC_frame_last->next_frame) {
5171                     Newxz(newframe, 1, scan_frame);
5172                     RExC_frame_last->next_frame= newframe;
5173                     newframe->prev_frame= RExC_frame_last;
5174                     RExC_frame_count++;
5175                 } else {
5176                     newframe= RExC_frame_last->next_frame;
5177                 }
5178                 RExC_frame_last= newframe;
5179
5180                 newframe->next_regnode = regnext(scan);
5181                 newframe->last_regnode = last;
5182                 newframe->stopparen = stopparen;
5183                 newframe->prev_recursed_depth = recursed_depth;
5184                 newframe->this_prev_frame= frame;
5185
5186                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5187                 DEBUG_PEEP("fnew", scan, depth, flags);
5188
5189                 frame = newframe;
5190                 scan =  start;
5191                 stopparen = paren;
5192                 last = end;
5193                 depth = depth + 1;
5194                 recursed_depth= my_recursed_depth;
5195
5196                 continue;
5197             }
5198         }
5199         else if (   OP(scan) == EXACT
5200                  || OP(scan) == EXACT_ONLY8
5201                  || OP(scan) == EXACTL)
5202         {
5203             SSize_t l = STR_LEN(scan);
5204             UV uc;
5205             assert(l);
5206             if (UTF) {
5207                 const U8 * const s = (U8*)STRING(scan);
5208                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5209                 l = utf8_length(s, s + l);
5210             } else {
5211                 uc = *((U8*)STRING(scan));
5212             }
5213             min += l;
5214             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5215                 /* The code below prefers earlier match for fixed
5216                    offset, later match for variable offset.  */
5217                 if (data->last_end == -1) { /* Update the start info. */
5218                     data->last_start_min = data->pos_min;
5219                     data->last_start_max = is_inf
5220                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
5221                 }
5222                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5223                 if (UTF)
5224                     SvUTF8_on(data->last_found);
5225                 {
5226                     SV * const sv = data->last_found;
5227                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5228                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5229                     if (mg && mg->mg_len >= 0)
5230                         mg->mg_len += utf8_length((U8*)STRING(scan),
5231                                               (U8*)STRING(scan)+STR_LEN(scan));
5232                 }
5233                 data->last_end = data->pos_min + l;
5234                 data->pos_min += l; /* As in the first entry. */
5235                 data->flags &= ~SF_BEFORE_EOL;
5236             }
5237
5238             /* ANDing the code point leaves at most it, and not in locale, and
5239              * can't match null string */
5240             if (flags & SCF_DO_STCLASS_AND) {
5241                 ssc_cp_and(data->start_class, uc);
5242                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5243                 ssc_clear_locale(data->start_class);
5244             }
5245             else if (flags & SCF_DO_STCLASS_OR) {
5246                 ssc_add_cp(data->start_class, uc);
5247                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5248
5249                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5250                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5251             }
5252             flags &= ~SCF_DO_STCLASS;
5253         }
5254         else if (PL_regkind[OP(scan)] == EXACT) {
5255             /* But OP != EXACT!, so is EXACTFish */
5256             SSize_t l = STR_LEN(scan);
5257             const U8 * s = (U8*)STRING(scan);
5258
5259             /* Search for fixed substrings supports EXACT only. */
5260             if (flags & SCF_DO_SUBSTR) {
5261                 assert(data);
5262                 scan_commit(pRExC_state, data, minlenp, is_inf);
5263             }
5264             if (UTF) {
5265                 l = utf8_length(s, s + l);
5266             }
5267             if (unfolded_multi_char) {
5268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5269             }
5270             min += l - min_subtract;
5271             assert (min >= 0);
5272             delta += min_subtract;
5273             if (flags & SCF_DO_SUBSTR) {
5274                 data->pos_min += l - min_subtract;
5275                 if (data->pos_min < 0) {
5276                     data->pos_min = 0;
5277                 }
5278                 data->pos_delta += min_subtract;
5279                 if (min_subtract) {
5280                     data->cur_is_floating = 1; /* float */
5281                 }
5282             }
5283
5284             if (flags & SCF_DO_STCLASS) {
5285                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5286
5287                 assert(EXACTF_invlist);
5288                 if (flags & SCF_DO_STCLASS_AND) {
5289                     if (OP(scan) != EXACTFL)
5290                         ssc_clear_locale(data->start_class);
5291                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5292                     ANYOF_POSIXL_ZERO(data->start_class);
5293                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5294                 }
5295                 else {  /* SCF_DO_STCLASS_OR */
5296                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5297                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5298
5299                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5300                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5301                 }
5302                 flags &= ~SCF_DO_STCLASS;
5303                 SvREFCNT_dec(EXACTF_invlist);
5304             }
5305         }
5306         else if (REGNODE_VARIES(OP(scan))) {
5307             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5308             I32 fl = 0, f = flags;
5309             regnode * const oscan = scan;
5310             regnode_ssc this_class;
5311             regnode_ssc *oclass = NULL;
5312             I32 next_is_eval = 0;
5313
5314             switch (PL_regkind[OP(scan)]) {
5315             case WHILEM:                /* End of (?:...)* . */
5316                 scan = NEXTOPER(scan);
5317                 goto finish;
5318             case PLUS:
5319                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5320                     next = NEXTOPER(scan);
5321                     if (   OP(next) == EXACT
5322                         || OP(next) == EXACT_ONLY8
5323                         || OP(next) == EXACTL
5324                         || (flags & SCF_DO_STCLASS))
5325                     {
5326                         mincount = 1;
5327                         maxcount = REG_INFTY;
5328                         next = regnext(scan);
5329                         scan = NEXTOPER(scan);
5330                         goto do_curly;
5331                     }
5332                 }
5333                 if (flags & SCF_DO_SUBSTR)
5334                     data->pos_min++;
5335                 min++;
5336                 /* FALLTHROUGH */
5337             case STAR:
5338                 next = NEXTOPER(scan);
5339
5340                 /* This temporary node can now be turned into EXACTFU, and
5341                  * must, as regexec.c doesn't handle it */
5342                 if (OP(next) == EXACTFU_S_EDGE) {
5343                     OP(next) = EXACTFU;
5344                 }
5345
5346                 if (     STR_LEN(next) == 1
5347                     &&   isALPHA_A(* STRING(next))
5348                     && (         OP(next) == EXACTFAA
5349                         || (     OP(next) == EXACTFU
5350                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5351                 {
5352                     /* These differ in just one bit */
5353                     U8 mask = ~ ('A' ^ 'a');
5354
5355                     assert(isALPHA_A(* STRING(next)));
5356
5357                     /* Then replace it by an ANYOFM node, with
5358                     * the mask set to the complement of the
5359                     * bit that differs between upper and lower
5360                     * case, and the lowest code point of the
5361                     * pair (which the '&' forces) */
5362                     OP(next) = ANYOFM;
5363                     ARG_SET(next, *STRING(next) & mask);
5364                     FLAGS(next) = mask;
5365                 }
5366
5367                 if (flags & SCF_DO_STCLASS) {
5368                     mincount = 0;
5369                     maxcount = REG_INFTY;
5370                     next = regnext(scan);
5371                     scan = NEXTOPER(scan);
5372                     goto do_curly;
5373                 }
5374                 if (flags & SCF_DO_SUBSTR) {
5375                     scan_commit(pRExC_state, data, minlenp, is_inf);
5376                     /* Cannot extend fixed substrings */
5377                     data->cur_is_floating = 1; /* float */
5378                 }
5379                 is_inf = is_inf_internal = 1;
5380                 scan = regnext(scan);
5381                 goto optimize_curly_tail;
5382             case CURLY:
5383                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5384                     && (scan->flags == stopparen))
5385                 {
5386                     mincount = 1;
5387                     maxcount = 1;
5388                 } else {
5389                     mincount = ARG1(scan);
5390                     maxcount = ARG2(scan);
5391                 }
5392                 next = regnext(scan);
5393                 if (OP(scan) == CURLYX) {
5394                     I32 lp = (data ? *(data->last_closep) : 0);
5395                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5396                 }
5397                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5398                 next_is_eval = (OP(scan) == EVAL);
5399               do_curly:
5400                 if (flags & SCF_DO_SUBSTR) {
5401                     if (mincount == 0)
5402                         scan_commit(pRExC_state, data, minlenp, is_inf);
5403                     /* Cannot extend fixed substrings */
5404                     pos_before = data->pos_min;
5405                 }
5406                 if (data) {
5407                     fl = data->flags;
5408                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5409                     if (is_inf)
5410                         data->flags |= SF_IS_INF;
5411                 }
5412                 if (flags & SCF_DO_STCLASS) {
5413                     ssc_init(pRExC_state, &this_class);
5414                     oclass = data->start_class;
5415                     data->start_class = &this_class;
5416                     f |= SCF_DO_STCLASS_AND;
5417                     f &= ~SCF_DO_STCLASS_OR;
5418                 }
5419                 /* Exclude from super-linear cache processing any {n,m}
5420                    regops for which the combination of input pos and regex
5421                    pos is not enough information to determine if a match
5422                    will be possible.
5423
5424                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5425                    regex pos at the \s*, the prospects for a match depend not
5426                    only on the input position but also on how many (bar\s*)
5427                    repeats into the {4,8} we are. */
5428                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5429                     f &= ~SCF_WHILEM_VISITED_POS;
5430
5431                 /* This will finish on WHILEM, setting scan, or on NULL: */
5432                 /* recurse study_chunk() on loop bodies */
5433                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5434                                   last, data, stopparen, recursed_depth, NULL,
5435                                   (mincount == 0
5436                                    ? (f & ~SCF_DO_SUBSTR)
5437                                    : f)
5438                                   ,depth+1);
5439
5440                 if (flags & SCF_DO_STCLASS)
5441                     data->start_class = oclass;
5442                 if (mincount == 0 || minnext == 0) {
5443                     if (flags & SCF_DO_STCLASS_OR) {
5444                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5445                     }
5446                     else if (flags & SCF_DO_STCLASS_AND) {
5447                         /* Switch to OR mode: cache the old value of
5448                          * data->start_class */
5449                         INIT_AND_WITHP;
5450                         StructCopy(data->start_class, and_withp, regnode_ssc);
5451                         flags &= ~SCF_DO_STCLASS_AND;
5452                         StructCopy(&this_class, data->start_class, regnode_ssc);
5453                         flags |= SCF_DO_STCLASS_OR;
5454                         ANYOF_FLAGS(data->start_class)
5455                                                 |= SSC_MATCHES_EMPTY_STRING;
5456                     }
5457                 } else {                /* Non-zero len */
5458                     if (flags & SCF_DO_STCLASS_OR) {
5459                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5460                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5461                     }
5462                     else if (flags & SCF_DO_STCLASS_AND)
5463                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5464                     flags &= ~SCF_DO_STCLASS;
5465                 }
5466                 if (!scan)              /* It was not CURLYX, but CURLY. */
5467                     scan = next;
5468                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5469                     /* ? quantifier ok, except for (?{ ... }) */
5470                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5471                     && (minnext == 0) && (deltanext == 0)
5472                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5473                     && maxcount <= REG_INFTY/3) /* Complement check for big
5474                                                    count */
5475                 {
5476                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5477                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5478                             "Quantifier unexpected on zero-length expression "
5479                             "in regex m/%" UTF8f "/",
5480                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5481                                   RExC_precomp)));
5482                 }
5483
5484                 min += minnext * mincount;
5485                 is_inf_internal |= deltanext == SSize_t_MAX
5486                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5487                 is_inf |= is_inf_internal;
5488                 if (is_inf) {
5489                     delta = SSize_t_MAX;
5490                 } else {
5491                     delta += (minnext + deltanext) * maxcount
5492                              - minnext * mincount;
5493                 }
5494                 /* Try powerful optimization CURLYX => CURLYN. */
5495                 if (  OP(oscan) == CURLYX && data
5496                       && data->flags & SF_IN_PAR
5497                       && !(data->flags & SF_HAS_EVAL)
5498                       && !deltanext && minnext == 1 ) {
5499                     /* Try to optimize to CURLYN.  */
5500                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5501                     regnode * const nxt1 = nxt;
5502 #ifdef DEBUGGING
5503                     regnode *nxt2;
5504 #endif
5505
5506                     /* Skip open. */
5507                     nxt = regnext(nxt);
5508                     if (!REGNODE_SIMPLE(OP(nxt))
5509                         && !(PL_regkind[OP(nxt)] == EXACT
5510                              && STR_LEN(nxt) == 1))
5511                         goto nogo;
5512 #ifdef DEBUGGING
5513                     nxt2 = nxt;
5514 #endif
5515                     nxt = regnext(nxt);
5516                     if (OP(nxt) != CLOSE)
5517                         goto nogo;
5518                     if (RExC_open_parens) {
5519
5520                         /*open->CURLYM*/
5521                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5522
5523                         /*close->while*/
5524                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5525                     }
5526                     /* Now we know that nxt2 is the only contents: */
5527                     oscan->flags = (U8)ARG(nxt);
5528                     OP(oscan) = CURLYN;
5529                     OP(nxt1) = NOTHING; /* was OPEN. */
5530
5531 #ifdef DEBUGGING
5532                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5533                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5534                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5535                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5536                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5537                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5538 #endif
5539                 }
5540               nogo:
5541
5542                 /* Try optimization CURLYX => CURLYM. */
5543                 if (  OP(oscan) == CURLYX && data
5544                       && !(data->flags & SF_HAS_PAR)
5545                       && !(data->flags & SF_HAS_EVAL)
5546                       && !deltanext     /* atom is fixed width */
5547                       && minnext != 0   /* CURLYM can't handle zero width */
5548
5549                          /* Nor characters whose fold at run-time may be
5550                           * multi-character */
5551                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5552                 ) {
5553                     /* XXXX How to optimize if data == 0? */
5554                     /* Optimize to a simpler form.  */
5555                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5556                     regnode *nxt2;
5557
5558                     OP(oscan) = CURLYM;
5559                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5560                             && (OP(nxt2) != WHILEM))
5561                         nxt = nxt2;
5562                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5563                     /* Need to optimize away parenths. */
5564                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5565                         /* Set the parenth number.  */
5566                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5567
5568                         oscan->flags = (U8)ARG(nxt);
5569                         if (RExC_open_parens) {
5570                              /*open->CURLYM*/
5571                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5572
5573                             /*close->NOTHING*/
5574                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5575                                                          + 1;
5576                         }
5577                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5578                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5579
5580 #ifdef DEBUGGING
5581                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5582                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5583                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5584                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5585 #endif
5586 #if 0
5587                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5588                             regnode *nnxt = regnext(nxt1);
5589                             if (nnxt == nxt) {
5590                                 if (reg_off_by_arg[OP(nxt1)])
5591                                     ARG_SET(nxt1, nxt2 - nxt1);
5592                                 else if (nxt2 - nxt1 < U16_MAX)
5593                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5594                                 else
5595                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5596                             }
5597                             nxt1 = nnxt;
5598                         }
5599 #endif
5600                         /* Optimize again: */
5601                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5602                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5603                                     NULL, stopparen, recursed_depth, NULL, 0,
5604                                     depth+1);
5605                     }
5606                     else
5607                         oscan->flags = 0;
5608                 }
5609                 else if ((OP(oscan) == CURLYX)
5610                          && (flags & SCF_WHILEM_VISITED_POS)
5611                          /* See the comment on a similar expression above.
5612                             However, this time it's not a subexpression
5613                             we care about, but the expression itself. */
5614                          && (maxcount == REG_INFTY)
5615                          && data) {
5616                     /* This stays as CURLYX, we can put the count/of pair. */
5617                     /* Find WHILEM (as in regexec.c) */
5618                     regnode *nxt = oscan + NEXT_OFF(oscan);
5619
5620                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5621                         nxt += ARG(nxt);
5622                     nxt = PREVOPER(nxt);
5623                     if (nxt->flags & 0xf) {
5624                         /* we've already set whilem count on this node */
5625                     } else if (++data->whilem_c < 16) {
5626                         assert(data->whilem_c <= RExC_whilem_seen);
5627                         nxt->flags = (U8)(data->whilem_c
5628                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5629                     }
5630                 }
5631                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5632                     pars++;
5633                 if (flags & SCF_DO_SUBSTR) {
5634                     SV *last_str = NULL;
5635                     STRLEN last_chrs = 0;
5636                     int counted = mincount != 0;
5637
5638                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5639                                                                   string. */
5640                         SSize_t b = pos_before >= data->last_start_min
5641                             ? pos_before : data->last_start_min;
5642                         STRLEN l;
5643                         const char * const s = SvPV_const(data->last_found, l);
5644                         SSize_t old = b - data->last_start_min;
5645                         assert(old >= 0);
5646
5647                         if (UTF)
5648                             old = utf8_hop_forward((U8*)s, old,
5649                                                (U8 *) SvEND(data->last_found))
5650                                 - (U8*)s;
5651                         l -= old;
5652                         /* Get the added string: */
5653                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5654                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5655                                             (U8*)(s + old + l)) : l;
5656                         if (deltanext == 0 && pos_before == b) {
5657                             /* What was added is a constant string */
5658                             if (mincount > 1) {
5659
5660                                 SvGROW(last_str, (mincount * l) + 1);
5661                                 repeatcpy(SvPVX(last_str) + l,
5662                                           SvPVX_const(last_str), l,
5663                                           mincount - 1);
5664                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5665                                 /* Add additional parts. */
5666                                 SvCUR_set(data->last_found,
5667                                           SvCUR(data->last_found) - l);
5668                                 sv_catsv(data->last_found, last_str);
5669                                 {
5670                                     SV * sv = data->last_found;
5671                                     MAGIC *mg =
5672                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5673                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5674                                     if (mg && mg->mg_len >= 0)
5675                                         mg->mg_len += last_chrs * (mincount-1);
5676                                 }
5677                                 last_chrs *= mincount;
5678                                 data->last_end += l * (mincount - 1);
5679                             }
5680                         } else {
5681                             /* start offset must point into the last copy */
5682                             data->last_start_min += minnext * (mincount - 1);
5683                             data->last_start_max =
5684                               is_inf
5685                                ? SSize_t_MAX
5686                                : data->last_start_max +
5687                                  (maxcount - 1) * (minnext + data->pos_delta);
5688                         }
5689                     }
5690                     /* It is counted once already... */
5691                     data->pos_min += minnext * (mincount - counted);
5692 #if 0
5693 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5694                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5695                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5696     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5697     (UV)mincount);
5698 if (deltanext != SSize_t_MAX)
5699 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5700     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5701           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5702 #endif
5703                     if (deltanext == SSize_t_MAX
5704                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5705                         data->pos_delta = SSize_t_MAX;
5706                     else
5707                         data->pos_delta += - counted * deltanext +
5708                         (minnext + deltanext) * maxcount - minnext * mincount;
5709                     if (mincount != maxcount) {
5710                          /* Cannot extend fixed substrings found inside
5711                             the group.  */
5712                         scan_commit(pRExC_state, data, minlenp, is_inf);
5713                         if (mincount && last_str) {
5714                             SV * const sv = data->last_found;
5715                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5716                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5717
5718                             if (mg)
5719                                 mg->mg_len = -1;
5720                             sv_setsv(sv, last_str);
5721                             data->last_end = data->pos_min;
5722                             data->last_start_min = data->pos_min - last_chrs;
5723                             data->last_start_max = is_inf
5724                                 ? SSize_t_MAX
5725                                 : data->pos_min + data->pos_delta - last_chrs;
5726                         }
5727                         data->cur_is_floating = 1; /* float */
5728                     }
5729                     SvREFCNT_dec(last_str);
5730                 }
5731                 if (data && (fl & SF_HAS_EVAL))
5732                     data->flags |= SF_HAS_EVAL;
5733               optimize_curly_tail:
5734                 if (OP(oscan) != CURLYX) {
5735                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5736                            && NEXT_OFF(next))
5737                         NEXT_OFF(oscan) += NEXT_OFF(next);
5738                 }
5739                 continue;
5740
5741             default:
5742 #ifdef DEBUGGING
5743                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5744                                                                     OP(scan));
5745 #endif
5746             case REF:
5747             case CLUMP:
5748                 if (flags & SCF_DO_SUBSTR) {
5749                     /* Cannot expect anything... */
5750                     scan_commit(pRExC_state, data, minlenp, is_inf);
5751                     data->cur_is_floating = 1; /* float */
5752                 }
5753                 is_inf = is_inf_internal = 1;
5754                 if (flags & SCF_DO_STCLASS_OR) {
5755                     if (OP(scan) == CLUMP) {
5756                         /* Actually is any start char, but very few code points
5757                          * aren't start characters */
5758                         ssc_match_all_cp(data->start_class);
5759                     }
5760                     else {
5761                         ssc_anything(data->start_class);
5762                     }
5763                 }
5764                 flags &= ~SCF_DO_STCLASS;
5765                 break;
5766             }
5767         }
5768         else if (OP(scan) == LNBREAK) {
5769             if (flags & SCF_DO_STCLASS) {
5770                 if (flags & SCF_DO_STCLASS_AND) {
5771                     ssc_intersection(data->start_class,
5772                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5773                     ssc_clear_locale(data->start_class);
5774                     ANYOF_FLAGS(data->start_class)
5775                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5776                 }
5777                 else if (flags & SCF_DO_STCLASS_OR) {
5778                     ssc_union(data->start_class,
5779                               PL_XPosix_ptrs[_CC_VERTSPACE],
5780                               FALSE);
5781                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5782
5783                     /* See commit msg for
5784                      * 749e076fceedeb708a624933726e7989f2302f6a */
5785                     ANYOF_FLAGS(data->start_class)
5786                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5787                 }
5788                 flags &= ~SCF_DO_STCLASS;
5789             }
5790             min++;
5791             if (delta != SSize_t_MAX)
5792                 delta++;    /* Because of the 2 char string cr-lf */
5793             if (flags & SCF_DO_SUBSTR) {
5794                 /* Cannot expect anything... */
5795                 scan_commit(pRExC_state, data, minlenp, is_inf);
5796                 data->pos_min += 1;
5797                 if (data->pos_delta != SSize_t_MAX) {
5798                     data->pos_delta += 1;
5799                 }
5800                 data->cur_is_floating = 1; /* float */
5801             }
5802         }
5803         else if (REGNODE_SIMPLE(OP(scan))) {
5804
5805             if (flags & SCF_DO_SUBSTR) {
5806                 scan_commit(pRExC_state, data, minlenp, is_inf);
5807                 data->pos_min++;
5808             }
5809             min++;
5810             if (flags & SCF_DO_STCLASS) {
5811                 bool invert = 0;
5812                 SV* my_invlist = NULL;
5813                 U8 namedclass;
5814
5815                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5816                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5817
5818                 /* Some of the logic below assumes that switching
5819                    locale on will only add false positives. */
5820                 switch (OP(scan)) {
5821
5822                 default:
5823 #ifdef DEBUGGING
5824                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5825                                                                      OP(scan));
5826 #endif
5827                 case SANY:
5828                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5829                         ssc_match_all_cp(data->start_class);
5830                     break;
5831
5832                 case REG_ANY:
5833                     {
5834                         SV* REG_ANY_invlist = _new_invlist(2);
5835                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5836                                                             '\n');
5837                         if (flags & SCF_DO_STCLASS_OR) {
5838                             ssc_union(data->start_class,
5839                                       REG_ANY_invlist,
5840                                       TRUE /* TRUE => invert, hence all but \n
5841                                             */
5842                                       );
5843                         }
5844                         else if (flags & SCF_DO_STCLASS_AND) {
5845                             ssc_intersection(data->start_class,
5846                                              REG_ANY_invlist,
5847                                              TRUE  /* TRUE => invert */
5848                                              );
5849                             ssc_clear_locale(data->start_class);
5850                         }
5851                         SvREFCNT_dec_NN(REG_ANY_invlist);
5852                     }
5853                     break;
5854
5855                 case ANYOFD:
5856                 case ANYOFL:
5857                 case ANYOFPOSIXL:
5858                 case ANYOFH:
5859                 case ANYOFHb:
5860                 case ANYOFHr:
5861                 case ANYOF:
5862                     if (flags & SCF_DO_STCLASS_AND)
5863                         ssc_and(pRExC_state, data->start_class,
5864                                 (regnode_charclass *) scan);
5865                     else
5866                         ssc_or(pRExC_state, data->start_class,
5867                                                           (regnode_charclass *) scan);
5868                     break;
5869
5870                 case NANYOFM:
5871                 case ANYOFM:
5872                   {
5873                     SV* cp_list = get_ANYOFM_contents(scan);
5874
5875                     if (flags & SCF_DO_STCLASS_OR) {
5876                         ssc_union(data->start_class, cp_list, invert);
5877                     }
5878                     else if (flags & SCF_DO_STCLASS_AND) {
5879                         ssc_intersection(data->start_class, cp_list, invert);
5880                     }
5881
5882                     SvREFCNT_dec_NN(cp_list);
5883                     break;
5884                   }
5885
5886                 case NPOSIXL:
5887                     invert = 1;
5888                     /* FALLTHROUGH */
5889
5890                 case POSIXL:
5891                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5892                     if (flags & SCF_DO_STCLASS_AND) {
5893                         bool was_there = cBOOL(
5894                                           ANYOF_POSIXL_TEST(data->start_class,
5895                                                                  namedclass));
5896                         ANYOF_POSIXL_ZERO(data->start_class);
5897                         if (was_there) {    /* Do an AND */
5898                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5899                         }
5900                         /* No individual code points can now match */
5901                         data->start_class->invlist
5902                                                 = sv_2mortal(_new_invlist(0));
5903                     }
5904                     else {
5905                         int complement = namedclass + ((invert) ? -1 : 1);
5906
5907                         assert(flags & SCF_DO_STCLASS_OR);
5908
5909                         /* If the complement of this class was already there,
5910                          * the result is that they match all code points,
5911                          * (\d + \D == everything).  Remove the classes from
5912                          * future consideration.  Locale is not relevant in
5913                          * this case */
5914                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5915                             ssc_match_all_cp(data->start_class);
5916                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5917                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5918                         }
5919                         else {  /* The usual case; just add this class to the
5920                                    existing set */
5921                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5922                         }
5923                     }
5924                     break;
5925
5926                 case NPOSIXA:   /* For these, we always know the exact set of
5927                                    what's matched */
5928                     invert = 1;
5929                     /* FALLTHROUGH */
5930                 case POSIXA:
5931                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5932                     goto join_posix_and_ascii;
5933
5934                 case NPOSIXD:
5935                 case NPOSIXU:
5936                     invert = 1;
5937                     /* FALLTHROUGH */
5938                 case POSIXD:
5939                 case POSIXU:
5940                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5941
5942                     /* NPOSIXD matches all upper Latin1 code points unless the
5943                      * target string being matched is UTF-8, which is
5944                      * unknowable until match time.  Since we are going to
5945                      * invert, we want to get rid of all of them so that the
5946                      * inversion will match all */
5947                     if (OP(scan) == NPOSIXD) {
5948                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5949                                           &my_invlist);
5950                     }
5951
5952                   join_posix_and_ascii:
5953
5954                     if (flags & SCF_DO_STCLASS_AND) {
5955                         ssc_intersection(data->start_class, my_invlist, invert);
5956                         ssc_clear_locale(data->start_class);
5957                     }
5958                     else {
5959                         assert(flags & SCF_DO_STCLASS_OR);
5960                         ssc_union(data->start_class, my_invlist, invert);
5961                     }
5962                     SvREFCNT_dec(my_invlist);
5963                 }
5964                 if (flags & SCF_DO_STCLASS_OR)
5965                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5966                 flags &= ~SCF_DO_STCLASS;
5967             }
5968         }
5969         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5970             data->flags |= (OP(scan) == MEOL
5971                             ? SF_BEFORE_MEOL
5972                             : SF_BEFORE_SEOL);
5973             scan_commit(pRExC_state, data, minlenp, is_inf);
5974
5975         }
5976         else if (  PL_regkind[OP(scan)] == BRANCHJ
5977                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5978                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5979                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5980         {
5981             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5982                 || OP(scan) == UNLESSM )
5983             {
5984                 /* Negative Lookahead/lookbehind
5985                    In this case we can't do fixed string optimisation.
5986                 */
5987
5988                 SSize_t deltanext, minnext, fake = 0;
5989                 regnode *nscan;
5990                 regnode_ssc intrnl;
5991                 int f = 0;
5992
5993                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5994                 if (data) {
5995                     data_fake.whilem_c = data->whilem_c;
5996                     data_fake.last_closep = data->last_closep;
5997                 }
5998                 else
5999                     data_fake.last_closep = &fake;
6000                 data_fake.pos_delta = delta;
6001                 if ( flags & SCF_DO_STCLASS && !scan->flags
6002                      && OP(scan) == IFMATCH ) { /* Lookahead */
6003                     ssc_init(pRExC_state, &intrnl);
6004                     data_fake.start_class = &intrnl;
6005                     f |= SCF_DO_STCLASS_AND;
6006                 }
6007                 if (flags & SCF_WHILEM_VISITED_POS)
6008                     f |= SCF_WHILEM_VISITED_POS;
6009                 next = regnext(scan);
6010                 nscan = NEXTOPER(NEXTOPER(scan));
6011
6012                 /* recurse study_chunk() for lookahead body */
6013                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6014                                       last, &data_fake, stopparen,
6015                                       recursed_depth, NULL, f, depth+1);
6016                 if (scan->flags) {
6017                     if (   deltanext < 0
6018                         || deltanext > (I32) U8_MAX
6019                         || minnext > (I32)U8_MAX
6020                         || minnext + deltanext > (I32)U8_MAX)
6021                     {
6022                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6023                               (UV)U8_MAX);
6024                     }
6025
6026                     /* The 'next_off' field has been repurposed to count the
6027                      * additional starting positions to try beyond the initial
6028                      * one.  (This leaves it at 0 for non-variable length
6029                      * matches to avoid breakage for those not using this
6030                      * extension) */
6031                     if (deltanext) {
6032                         scan->next_off = deltanext;
6033                         ckWARNexperimental(RExC_parse,
6034                             WARN_EXPERIMENTAL__VLB,
6035                             "Variable length lookbehind is experimental");
6036                     }
6037                     scan->flags = (U8)minnext + deltanext;
6038                 }
6039                 if (data) {
6040                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6041                         pars++;
6042                     if (data_fake.flags & SF_HAS_EVAL)
6043                         data->flags |= SF_HAS_EVAL;
6044                     data->whilem_c = data_fake.whilem_c;
6045                 }
6046                 if (f & SCF_DO_STCLASS_AND) {
6047                     if (flags & SCF_DO_STCLASS_OR) {
6048                         /* OR before, AND after: ideally we would recurse with
6049                          * data_fake to get the AND applied by study of the
6050                          * remainder of the pattern, and then derecurse;
6051                          * *** HACK *** for now just treat as "no information".
6052                          * See [perl #56690].
6053                          */
6054                         ssc_init(pRExC_state, data->start_class);
6055                     }  else {
6056                         /* AND before and after: combine and continue.  These
6057                          * assertions are zero-length, so can match an EMPTY
6058                          * string */
6059                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6060                         ANYOF_FLAGS(data->start_class)
6061                                                    |= SSC_MATCHES_EMPTY_STRING;
6062                     }
6063                 }
6064             }
6065 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6066             else {
6067                 /* Positive Lookahead/lookbehind
6068                    In this case we can do fixed string optimisation,
6069                    but we must be careful about it. Note in the case of
6070                    lookbehind the positions will be offset by the minimum
6071                    length of the pattern, something we won't know about
6072                    until after the recurse.
6073                 */
6074                 SSize_t deltanext, fake = 0;
6075                 regnode *nscan;
6076                 regnode_ssc intrnl;
6077                 int f = 0;
6078                 /* We use SAVEFREEPV so that when the full compile
6079                     is finished perl will clean up the allocated
6080                     minlens when it's all done. This way we don't
6081                     have to worry about freeing them when we know
6082                     they wont be used, which would be a pain.
6083                  */
6084                 SSize_t *minnextp;
6085                 Newx( minnextp, 1, SSize_t );
6086                 SAVEFREEPV(minnextp);
6087
6088                 if (data) {
6089                     StructCopy(data, &data_fake, scan_data_t);
6090                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6091                         f |= SCF_DO_SUBSTR;
6092                         if (scan->flags)
6093                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6094                         data_fake.last_found=newSVsv(data->last_found);
6095                     }
6096                 }
6097                 else
6098                     data_fake.last_closep = &fake;
6099                 data_fake.flags = 0;
6100                 data_fake.substrs[0].flags = 0;
6101                 data_fake.substrs[1].flags = 0;
6102                 data_fake.pos_delta = delta;
6103                 if (is_inf)
6104                     data_fake.flags |= SF_IS_INF;
6105                 if ( flags & SCF_DO_STCLASS && !scan->flags
6106                      && OP(scan) == IFMATCH ) { /* Lookahead */
6107                     ssc_init(pRExC_state, &intrnl);
6108                     data_fake.start_class = &intrnl;
6109                     f |= SCF_DO_STCLASS_AND;
6110                 }
6111                 if (flags & SCF_WHILEM_VISITED_POS)
6112                     f |= SCF_WHILEM_VISITED_POS;
6113                 next = regnext(scan);
6114                 nscan = NEXTOPER(NEXTOPER(scan));
6115
6116                 /* positive lookahead study_chunk() recursion */
6117                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6118                                         &deltanext, last, &data_fake,
6119                                         stopparen, recursed_depth, NULL,
6120                                         f, depth+1);
6121                 if (scan->flags) {
6122                     assert(0);  /* This code has never been tested since this
6123                                    is normally not compiled */
6124                     if (   deltanext < 0
6125                         || deltanext > (I32) U8_MAX
6126                         || *minnextp > (I32)U8_MAX
6127                         || *minnextp + deltanext > (I32)U8_MAX)
6128                     {
6129                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6130                               (UV)U8_MAX);
6131                     }
6132
6133                     if (deltanext) {
6134                         scan->next_off = deltanext;
6135                     }
6136                     scan->flags = (U8)*minnextp + deltanext;
6137                 }
6138
6139                 *minnextp += min;
6140
6141                 if (f & SCF_DO_STCLASS_AND) {
6142                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6143                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6144                 }
6145                 if (data) {
6146                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6147                         pars++;
6148                     if (data_fake.flags & SF_HAS_EVAL)
6149                         data->flags |= SF_HAS_EVAL;
6150                     data->whilem_c = data_fake.whilem_c;
6151                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6152                         int i;
6153                         if (RExC_rx->minlen<*minnextp)
6154                             RExC_rx->minlen=*minnextp;
6155                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6156                         SvREFCNT_dec_NN(data_fake.last_found);
6157
6158                         for (i = 0; i < 2; i++) {
6159                             if (data_fake.substrs[i].minlenp != minlenp) {
6160                                 data->substrs[i].min_offset =
6161                                             data_fake.substrs[i].min_offset;
6162                                 data->substrs[i].max_offset =
6163                                             data_fake.substrs[i].max_offset;
6164                                 data->substrs[i].minlenp =
6165                                             data_fake.substrs[i].minlenp;
6166                                 data->substrs[i].lookbehind += scan->flags;
6167                             }
6168                         }
6169                     }
6170                 }
6171             }
6172 #endif
6173         }
6174
6175         else if (OP(scan) == OPEN) {
6176             if (stopparen != (I32)ARG(scan))
6177                 pars++;
6178         }
6179         else if (OP(scan) == CLOSE) {
6180             if (stopparen == (I32)ARG(scan)) {
6181                 break;
6182             }
6183             if ((I32)ARG(scan) == is_par) {
6184                 next = regnext(scan);
6185
6186                 if ( next && (OP(next) != WHILEM) && next < last)
6187                     is_par = 0;         /* Disable optimization */
6188             }
6189             if (data)
6190                 *(data->last_closep) = ARG(scan);
6191         }
6192         else if (OP(scan) == EVAL) {
6193                 if (data)
6194                     data->flags |= SF_HAS_EVAL;
6195         }
6196         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6197             if (flags & SCF_DO_SUBSTR) {
6198                 scan_commit(pRExC_state, data, minlenp, is_inf);
6199                 flags &= ~SCF_DO_SUBSTR;
6200             }
6201             if (data && OP(scan)==ACCEPT) {
6202                 data->flags |= SCF_SEEN_ACCEPT;
6203                 if (stopmin > min)
6204                     stopmin = min;
6205             }
6206         }
6207         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6208         {
6209                 if (flags & SCF_DO_SUBSTR) {
6210                     scan_commit(pRExC_state, data, minlenp, is_inf);
6211                     data->cur_is_floating = 1; /* float */
6212                 }
6213                 is_inf = is_inf_internal = 1;
6214                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6215                     ssc_anything(data->start_class);
6216                 flags &= ~SCF_DO_STCLASS;
6217         }
6218         else if (OP(scan) == GPOS) {
6219             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6220                 !(delta || is_inf || (data && data->pos_delta)))
6221             {
6222                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6223                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6224                 if (RExC_rx->gofs < (STRLEN)min)
6225                     RExC_rx->gofs = min;
6226             } else {
6227                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6228                 RExC_rx->gofs = 0;
6229             }
6230         }
6231 #ifdef TRIE_STUDY_OPT
6232 #ifdef FULL_TRIE_STUDY
6233         else if (PL_regkind[OP(scan)] == TRIE) {
6234             /* NOTE - There is similar code to this block above for handling
6235                BRANCH nodes on the initial study.  If you change stuff here
6236                check there too. */
6237             regnode *trie_node= scan;
6238             regnode *tail= regnext(scan);
6239             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6240             SSize_t max1 = 0, min1 = SSize_t_MAX;
6241             regnode_ssc accum;
6242
6243             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6244                 /* Cannot merge strings after this. */
6245                 scan_commit(pRExC_state, data, minlenp, is_inf);
6246             }
6247             if (flags & SCF_DO_STCLASS)
6248                 ssc_init_zero(pRExC_state, &accum);
6249
6250             if (!trie->jump) {
6251                 min1= trie->minlen;
6252                 max1= trie->maxlen;
6253             } else {
6254                 const regnode *nextbranch= NULL;
6255                 U32 word;
6256
6257                 for ( word=1 ; word <= trie->wordcount ; word++)
6258                 {
6259                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6260                     regnode_ssc this_class;
6261
6262                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6263                     if (data) {
6264                         data_fake.whilem_c = data->whilem_c;
6265                         data_fake.last_closep = data->last_closep;
6266                     }
6267                     else
6268                         data_fake.last_closep = &fake;
6269                     data_fake.pos_delta = delta;
6270                     if (flags & SCF_DO_STCLASS) {
6271                         ssc_init(pRExC_state, &this_class);
6272                         data_fake.start_class = &this_class;
6273                         f = SCF_DO_STCLASS_AND;
6274                     }
6275                     if (flags & SCF_WHILEM_VISITED_POS)
6276                         f |= SCF_WHILEM_VISITED_POS;
6277
6278                     if (trie->jump[word]) {
6279                         if (!nextbranch)
6280                             nextbranch = trie_node + trie->jump[0];
6281                         scan= trie_node + trie->jump[word];
6282                         /* We go from the jump point to the branch that follows
6283                            it. Note this means we need the vestigal unused
6284                            branches even though they arent otherwise used. */
6285                         /* optimise study_chunk() for TRIE */
6286                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6287                             &deltanext, (regnode *)nextbranch, &data_fake,
6288                             stopparen, recursed_depth, NULL, f, depth+1);
6289                     }
6290                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6291                         nextbranch= regnext((regnode*)nextbranch);
6292
6293                     if (min1 > (SSize_t)(minnext + trie->minlen))
6294                         min1 = minnext + trie->minlen;
6295                     if (deltanext == SSize_t_MAX) {
6296                         is_inf = is_inf_internal = 1;
6297                         max1 = SSize_t_MAX;
6298                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6299                         max1 = minnext + deltanext + trie->maxlen;
6300
6301                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6302                         pars++;
6303                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6304                         if ( stopmin > min + min1)
6305                             stopmin = min + min1;
6306                         flags &= ~SCF_DO_SUBSTR;
6307                         if (data)
6308                             data->flags |= SCF_SEEN_ACCEPT;
6309                     }
6310                     if (data) {
6311                         if (data_fake.flags & SF_HAS_EVAL)
6312                             data->flags |= SF_HAS_EVAL;
6313                         data->whilem_c = data_fake.whilem_c;
6314                     }
6315                     if (flags & SCF_DO_STCLASS)
6316                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6317                 }
6318             }
6319             if (flags & SCF_DO_SUBSTR) {
6320                 data->pos_min += min1;
6321                 data->pos_delta += max1 - min1;
6322                 if (max1 != min1 || is_inf)
6323                     data->cur_is_floating = 1; /* float */
6324             }
6325             min += min1;
6326             if (delta != SSize_t_MAX) {
6327                 if (SSize_t_MAX - (max1 - min1) >= delta)
6328                     delta += max1 - min1;
6329                 else
6330                     delta = SSize_t_MAX;
6331             }
6332             if (flags & SCF_DO_STCLASS_OR) {
6333                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6334                 if (min1) {
6335                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6336                     flags &= ~SCF_DO_STCLASS;
6337                 }
6338             }
6339             else if (flags & SCF_DO_STCLASS_AND) {
6340                 if (min1) {
6341                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6342                     flags &= ~SCF_DO_STCLASS;
6343                 }
6344                 else {
6345                     /* Switch to OR mode: cache the old value of
6346                      * data->start_class */
6347                     INIT_AND_WITHP;
6348                     StructCopy(data->start_class, and_withp, regnode_ssc);
6349                     flags &= ~SCF_DO_STCLASS_AND;
6350                     StructCopy(&accum, data->start_class, regnode_ssc);
6351                     flags |= SCF_DO_STCLASS_OR;
6352                 }
6353             }
6354             scan= tail;
6355             continue;
6356         }
6357 #else
6358         else if (PL_regkind[OP(scan)] == TRIE) {
6359             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6360             U8*bang=NULL;
6361
6362             min += trie->minlen;
6363             delta += (trie->maxlen - trie->minlen);
6364             flags &= ~SCF_DO_STCLASS; /* xxx */
6365             if (flags & SCF_DO_SUBSTR) {
6366                 /* Cannot expect anything... */
6367                 scan_commit(pRExC_state, data, minlenp, is_inf);
6368                 data->pos_min += trie->minlen;
6369                 data->pos_delta += (trie->maxlen - trie->minlen);
6370                 if (trie->maxlen != trie->minlen)
6371                     data->cur_is_floating = 1; /* float */
6372             }
6373             if (trie->jump) /* no more substrings -- for now /grr*/
6374                flags &= ~SCF_DO_SUBSTR;
6375         }
6376 #endif /* old or new */
6377 #endif /* TRIE_STUDY_OPT */
6378
6379         /* Else: zero-length, ignore. */
6380         scan = regnext(scan);
6381     }
6382
6383   finish:
6384     if (frame) {
6385         /* we need to unwind recursion. */
6386         depth = depth - 1;
6387
6388         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6389         DEBUG_PEEP("fend", scan, depth, flags);
6390
6391         /* restore previous context */
6392         last = frame->last_regnode;
6393         scan = frame->next_regnode;
6394         stopparen = frame->stopparen;
6395         recursed_depth = frame->prev_recursed_depth;
6396
6397         RExC_frame_last = frame->prev_frame;
6398         frame = frame->this_prev_frame;
6399         goto fake_study_recurse;
6400     }
6401
6402     assert(!frame);
6403     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6404
6405     *scanp = scan;
6406     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6407
6408     if (flags & SCF_DO_SUBSTR && is_inf)
6409         data->pos_delta = SSize_t_MAX - data->pos_min;
6410     if (is_par > (I32)U8_MAX)
6411         is_par = 0;
6412     if (is_par && pars==1 && data) {
6413         data->flags |= SF_IN_PAR;
6414         data->flags &= ~SF_HAS_PAR;
6415     }
6416     else if (pars && data) {
6417         data->flags |= SF_HAS_PAR;
6418         data->flags &= ~SF_IN_PAR;
6419     }
6420     if (flags & SCF_DO_STCLASS_OR)
6421         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6422     if (flags & SCF_TRIE_RESTUDY)
6423         data->flags |=  SCF_TRIE_RESTUDY;
6424
6425     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6426
6427     {
6428         SSize_t final_minlen= min < stopmin ? min : stopmin;
6429
6430         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6431             if (final_minlen > SSize_t_MAX - delta)
6432                 RExC_maxlen = SSize_t_MAX;
6433             else if (RExC_maxlen < final_minlen + delta)
6434                 RExC_maxlen = final_minlen + delta;
6435         }
6436         return final_minlen;
6437     }
6438     NOT_REACHED; /* NOTREACHED */
6439 }
6440
6441 STATIC U32
6442 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6443 {
6444     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6445
6446     PERL_ARGS_ASSERT_ADD_DATA;
6447
6448     Renewc(RExC_rxi->data,
6449            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6450            char, struct reg_data);
6451     if(count)
6452         Renew(RExC_rxi->data->what, count + n, U8);
6453     else
6454         Newx(RExC_rxi->data->what, n, U8);
6455     RExC_rxi->data->count = count + n;
6456     Copy(s, RExC_rxi->data->what + count, n, U8);
6457     return count;
6458 }
6459
6460 /*XXX: todo make this not included in a non debugging perl, but appears to be
6461  * used anyway there, in 'use re' */
6462 #ifndef PERL_IN_XSUB_RE
6463 void
6464 Perl_reginitcolors(pTHX)
6465 {
6466     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6467     if (s) {
6468         char *t = savepv(s);
6469         int i = 0;
6470         PL_colors[0] = t;
6471         while (++i < 6) {
6472             t = strchr(t, '\t');
6473             if (t) {
6474                 *t = '\0';
6475                 PL_colors[i] = ++t;
6476             }
6477             else
6478                 PL_colors[i] = t = (char *)"";
6479         }
6480     } else {
6481         int i = 0;
6482         while (i < 6)
6483             PL_colors[i++] = (char *)"";
6484     }
6485     PL_colorset = 1;
6486 }
6487 #endif
6488
6489
6490 #ifdef TRIE_STUDY_OPT
6491 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6492     STMT_START {                                            \
6493         if (                                                \
6494               (data.flags & SCF_TRIE_RESTUDY)               \
6495               && ! restudied++                              \
6496         ) {                                                 \
6497             dOsomething;                                    \
6498             goto reStudy;                                   \
6499         }                                                   \
6500     } STMT_END
6501 #else
6502 #define CHECK_RESTUDY_GOTO_butfirst
6503 #endif
6504
6505 /*
6506  * pregcomp - compile a regular expression into internal code
6507  *
6508  * Decides which engine's compiler to call based on the hint currently in
6509  * scope
6510  */
6511
6512 #ifndef PERL_IN_XSUB_RE
6513
6514 /* return the currently in-scope regex engine (or the default if none)  */
6515
6516 regexp_engine const *
6517 Perl_current_re_engine(pTHX)
6518 {
6519     if (IN_PERL_COMPILETIME) {
6520         HV * const table = GvHV(PL_hintgv);
6521         SV **ptr;
6522
6523         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6524             return &PL_core_reg_engine;
6525         ptr = hv_fetchs(table, "regcomp", FALSE);
6526         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6527             return &PL_core_reg_engine;
6528         return INT2PTR(regexp_engine*, SvIV(*ptr));
6529     }
6530     else {
6531         SV *ptr;
6532         if (!PL_curcop->cop_hints_hash)
6533             return &PL_core_reg_engine;
6534         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6535         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6536             return &PL_core_reg_engine;
6537         return INT2PTR(regexp_engine*, SvIV(ptr));
6538     }
6539 }
6540
6541
6542 REGEXP *
6543 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6544 {
6545     regexp_engine const *eng = current_re_engine();
6546     GET_RE_DEBUG_FLAGS_DECL;
6547
6548     PERL_ARGS_ASSERT_PREGCOMP;
6549
6550     /* Dispatch a request to compile a regexp to correct regexp engine. */
6551     DEBUG_COMPILE_r({
6552         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6553                         PTR2UV(eng));
6554     });
6555     return CALLREGCOMP_ENG(eng, pattern, flags);
6556 }
6557 #endif
6558
6559 /* public(ish) entry point for the perl core's own regex compiling code.
6560  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6561  * pattern rather than a list of OPs, and uses the internal engine rather
6562  * than the current one */
6563
6564 REGEXP *
6565 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6566 {
6567     SV *pat = pattern; /* defeat constness! */
6568     PERL_ARGS_ASSERT_RE_COMPILE;
6569     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6570 #ifdef PERL_IN_XSUB_RE
6571                                 &my_reg_engine,
6572 #else
6573                                 &PL_core_reg_engine,
6574 #endif
6575                                 NULL, NULL, rx_flags, 0);
6576 }
6577
6578
6579 static void
6580 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6581 {
6582     int n;
6583
6584     if (--cbs->refcnt > 0)
6585         return;
6586     for (n = 0; n < cbs->count; n++) {
6587         REGEXP *rx = cbs->cb[n].src_regex;
6588         if (rx) {
6589             cbs->cb[n].src_regex = NULL;
6590             SvREFCNT_dec_NN(rx);
6591         }
6592     }
6593     Safefree(cbs->cb);
6594     Safefree(cbs);
6595 }
6596
6597
6598 static struct reg_code_blocks *
6599 S_alloc_code_blocks(pTHX_  int ncode)
6600 {
6601      struct reg_code_blocks *cbs;
6602     Newx(cbs, 1, struct reg_code_blocks);
6603     cbs->count = ncode;
6604     cbs->refcnt = 1;
6605     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6606     if (ncode)
6607         Newx(cbs->cb, ncode, struct reg_code_block);
6608     else
6609         cbs->cb = NULL;
6610     return cbs;
6611 }
6612
6613
6614 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6615  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6616  * point to the realloced string and length.
6617  *
6618  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6619  * stuff added */
6620
6621 static void
6622 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6623                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6624 {
6625     U8 *const src = (U8*)*pat_p;
6626     U8 *dst, *d;
6627     int n=0;
6628     STRLEN s = 0;
6629     bool do_end = 0;
6630     GET_RE_DEBUG_FLAGS_DECL;
6631
6632     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6633         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6634
6635     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6636     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6637     d = dst;
6638
6639     while (s < *plen_p) {
6640         append_utf8_from_native_byte(src[s], &d);
6641
6642         if (n < num_code_blocks) {
6643             assert(pRExC_state->code_blocks);
6644             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6645                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6646                 assert(*(d - 1) == '(');
6647                 do_end = 1;
6648             }
6649             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6650                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6651                 assert(*(d - 1) == ')');
6652                 do_end = 0;
6653                 n++;
6654             }
6655         }
6656         s++;
6657     }
6658     *d = '\0';
6659     *plen_p = d - dst;
6660     *pat_p = (char*) dst;
6661     SAVEFREEPV(*pat_p);
6662     RExC_orig_utf8 = RExC_utf8 = 1;
6663 }
6664
6665
6666
6667 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6668  * while recording any code block indices, and handling overloading,
6669  * nested qr// objects etc.  If pat is null, it will allocate a new
6670  * string, or just return the first arg, if there's only one.
6671  *
6672  * Returns the malloced/updated pat.
6673  * patternp and pat_count is the array of SVs to be concatted;
6674  * oplist is the optional list of ops that generated the SVs;
6675  * recompile_p is a pointer to a boolean that will be set if
6676  *   the regex will need to be recompiled.
6677  * delim, if non-null is an SV that will be inserted between each element
6678  */
6679
6680 static SV*
6681 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6682                 SV *pat, SV ** const patternp, int pat_count,
6683                 OP *oplist, bool *recompile_p, SV *delim)
6684 {
6685     SV **svp;
6686     int n = 0;
6687     bool use_delim = FALSE;
6688     bool alloced = FALSE;
6689
6690     /* if we know we have at least two args, create an empty string,
6691      * then concatenate args to that. For no args, return an empty string */
6692     if (!pat && pat_count != 1) {
6693         pat = newSVpvs("");
6694         SAVEFREESV(pat);
6695         alloced = TRUE;
6696     }
6697
6698     for (svp = patternp; svp < patternp + pat_count; svp++) {
6699         SV *sv;
6700         SV *rx  = NULL;
6701         STRLEN orig_patlen = 0;
6702         bool code = 0;
6703         SV *msv = use_delim ? delim : *svp;
6704         if (!msv) msv = &PL_sv_undef;
6705
6706         /* if we've got a delimiter, we go round the loop twice for each
6707          * svp slot (except the last), using the delimiter the second
6708          * time round */
6709         if (use_delim) {
6710             svp--;
6711             use_delim = FALSE;
6712         }
6713         else if (delim)
6714             use_delim = TRUE;
6715
6716         if (SvTYPE(msv) == SVt_PVAV) {
6717             /* we've encountered an interpolated array within
6718              * the pattern, e.g. /...@a..../. Expand the list of elements,
6719              * then recursively append elements.
6720              * The code in this block is based on S_pushav() */
6721
6722             AV *const av = (AV*)msv;
6723             const SSize_t maxarg = AvFILL(av) + 1;
6724             SV **array;
6725
6726             if (oplist) {
6727                 assert(oplist->op_type == OP_PADAV
6728                     || oplist->op_type == OP_RV2AV);
6729                 oplist = OpSIBLING(oplist);
6730             }
6731
6732             if (SvRMAGICAL(av)) {
6733                 SSize_t i;
6734
6735                 Newx(array, maxarg, SV*);
6736                 SAVEFREEPV(array);
6737                 for (i=0; i < maxarg; i++) {
6738                     SV ** const svp = av_fetch(av, i, FALSE);
6739                     array[i] = svp ? *svp : &PL_sv_undef;
6740                 }
6741             }
6742             else
6743                 array = AvARRAY(av);
6744
6745             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6746                                 array, maxarg, NULL, recompile_p,
6747                                 /* $" */
6748                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6749
6750             continue;
6751         }
6752
6753
6754         /* we make the assumption here that each op in the list of
6755          * op_siblings maps to one SV pushed onto the stack,
6756          * except for code blocks, with have both an OP_NULL and
6757          * and OP_CONST.
6758          * This allows us to match up the list of SVs against the
6759          * list of OPs to find the next code block.
6760          *
6761          * Note that       PUSHMARK PADSV PADSV ..
6762          * is optimised to
6763          *                 PADRANGE PADSV  PADSV  ..
6764          * so the alignment still works. */
6765
6766         if (oplist) {
6767             if (oplist->op_type == OP_NULL
6768                 && (oplist->op_flags & OPf_SPECIAL))
6769             {
6770                 assert(n < pRExC_state->code_blocks->count);
6771                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6772                 pRExC_state->code_blocks->cb[n].block = oplist;
6773                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6774                 n++;
6775                 code = 1;
6776                 oplist = OpSIBLING(oplist); /* skip CONST */
6777                 assert(oplist);
6778             }
6779             oplist = OpSIBLING(oplist);;
6780         }
6781
6782         /* apply magic and QR overloading to arg */
6783
6784         SvGETMAGIC(msv);
6785         if (SvROK(msv) && SvAMAGIC(msv)) {
6786             SV *sv = AMG_CALLunary(msv, regexp_amg);
6787             if (sv) {
6788                 if (SvROK(sv))
6789                     sv = SvRV(sv);
6790                 if (SvTYPE(sv) != SVt_REGEXP)
6791                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6792                 msv = sv;
6793             }
6794         }
6795
6796         /* try concatenation overload ... */
6797         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6798                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6799         {
6800             sv_setsv(pat, sv);
6801             /* overloading involved: all bets are off over literal
6802              * code. Pretend we haven't seen it */
6803             if (n)
6804                 pRExC_state->code_blocks->count -= n;
6805             n = 0;
6806         }
6807         else  {
6808             /* ... or failing that, try "" overload */
6809             while (SvAMAGIC(msv)
6810                     && (sv = AMG_CALLunary(msv, string_amg))
6811                     && sv != msv
6812                     &&  !(   SvROK(msv)
6813                           && SvROK(sv)
6814                           && SvRV(msv) == SvRV(sv))
6815             ) {
6816                 msv = sv;
6817                 SvGETMAGIC(msv);
6818             }
6819             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6820                 msv = SvRV(msv);
6821
6822             if (pat) {
6823                 /* this is a partially unrolled
6824                  *     sv_catsv_nomg(pat, msv);
6825                  * that allows us to adjust code block indices if
6826                  * needed */
6827                 STRLEN dlen;
6828                 char *dst = SvPV_force_nomg(pat, dlen);
6829                 orig_patlen = dlen;
6830                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6831                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6832                     sv_setpvn(pat, dst, dlen);
6833                     SvUTF8_on(pat);
6834                 }
6835                 sv_catsv_nomg(pat, msv);
6836                 rx = msv;
6837             }
6838             else {
6839                 /* We have only one SV to process, but we need to verify
6840                  * it is properly null terminated or we will fail asserts
6841                  * later. In theory we probably shouldn't get such SV's,
6842                  * but if we do we should handle it gracefully. */
6843                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6844                     /* not a string, or a string with a trailing null */
6845                     pat = msv;
6846                 } else {
6847                     /* a string with no trailing null, we need to copy it
6848                      * so it has a trailing null */
6849                     pat = sv_2mortal(newSVsv(msv));
6850                 }
6851             }
6852
6853             if (code)
6854                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6855         }
6856
6857         /* extract any code blocks within any embedded qr//'s */
6858         if (rx && SvTYPE(rx) == SVt_REGEXP
6859             && RX_ENGINE((REGEXP*)rx)->op_comp)
6860         {
6861
6862             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6863             if (ri->code_blocks && ri->code_blocks->count) {
6864                 int i;
6865                 /* the presence of an embedded qr// with code means
6866                  * we should always recompile: the text of the
6867                  * qr// may not have changed, but it may be a
6868                  * different closure than last time */
6869                 *recompile_p = 1;
6870                 if (pRExC_state->code_blocks) {
6871                     int new_count = pRExC_state->code_blocks->count
6872                             + ri->code_blocks->count;
6873                     Renew(pRExC_state->code_blocks->cb,
6874                             new_count, struct reg_code_block);
6875                     pRExC_state->code_blocks->count = new_count;
6876                 }
6877                 else
6878                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6879                                                     ri->code_blocks->count);
6880
6881                 for (i=0; i < ri->code_blocks->count; i++) {
6882                     struct reg_code_block *src, *dst;
6883                     STRLEN offset =  orig_patlen
6884                         + ReANY((REGEXP *)rx)->pre_prefix;
6885                     assert(n < pRExC_state->code_blocks->count);
6886                     src = &ri->code_blocks->cb[i];
6887                     dst = &pRExC_state->code_blocks->cb[n];
6888                     dst->start      = src->start + offset;
6889                     dst->end        = src->end   + offset;
6890                     dst->block      = src->block;
6891                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6892                                             src->src_regex
6893                                                 ? src->src_regex
6894                                                 : (REGEXP*)rx);
6895                     n++;
6896                 }
6897             }
6898         }
6899     }
6900     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6901     if (alloced)
6902         SvSETMAGIC(pat);
6903
6904     return pat;
6905 }
6906
6907
6908
6909 /* see if there are any run-time code blocks in the pattern.
6910  * False positives are allowed */
6911
6912 static bool
6913 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6914                     char *pat, STRLEN plen)
6915 {
6916     int n = 0;
6917     STRLEN s;
6918
6919     PERL_UNUSED_CONTEXT;
6920
6921     for (s = 0; s < plen; s++) {
6922         if (   pRExC_state->code_blocks
6923             && n < pRExC_state->code_blocks->count
6924             && s == pRExC_state->code_blocks->cb[n].start)
6925         {
6926             s = pRExC_state->code_blocks->cb[n].end;
6927             n++;
6928             continue;
6929         }
6930         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6931          * positives here */
6932         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6933             (pat[s+2] == '{'
6934                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6935         )
6936             return 1;
6937     }
6938     return 0;
6939 }
6940
6941 /* Handle run-time code blocks. We will already have compiled any direct
6942  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6943  * copy of it, but with any literal code blocks blanked out and
6944  * appropriate chars escaped; then feed it into
6945  *
6946  *    eval "qr'modified_pattern'"
6947  *
6948  * For example,
6949  *
6950  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6951  *
6952  * becomes
6953  *
6954  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6955  *
6956  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6957  * and merge them with any code blocks of the original regexp.
6958  *
6959  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6960  * instead, just save the qr and return FALSE; this tells our caller that
6961  * the original pattern needs upgrading to utf8.
6962  */
6963
6964 static bool
6965 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6966     char *pat, STRLEN plen)
6967 {
6968     SV *qr;
6969
6970     GET_RE_DEBUG_FLAGS_DECL;
6971
6972     if (pRExC_state->runtime_code_qr) {
6973         /* this is the second time we've been called; this should
6974          * only happen if the main pattern got upgraded to utf8
6975          * during compilation; re-use the qr we compiled first time
6976          * round (which should be utf8 too)
6977          */
6978         qr = pRExC_state->runtime_code_qr;
6979         pRExC_state->runtime_code_qr = NULL;
6980         assert(RExC_utf8 && SvUTF8(qr));
6981     }
6982     else {
6983         int n = 0;
6984         STRLEN s;
6985         char *p, *newpat;
6986         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6987         SV *sv, *qr_ref;
6988         dSP;
6989
6990         /* determine how many extra chars we need for ' and \ escaping */
6991         for (s = 0; s < plen; s++) {
6992             if (pat[s] == '\'' || pat[s] == '\\')
6993                 newlen++;
6994         }
6995
6996         Newx(newpat, newlen, char);
6997         p = newpat;
6998         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6999
7000         for (s = 0; s < plen; s++) {
7001             if (   pRExC_state->code_blocks
7002                 && n < pRExC_state->code_blocks->count
7003                 && s == pRExC_state->code_blocks->cb[n].start)
7004             {
7005                 /* blank out literal code block so that they aren't
7006                  * recompiled: eg change from/to:
7007                  *     /(?{xyz})/
7008                  *     /(?=====)/
7009                  * and
7010                  *     /(??{xyz})/
7011                  *     /(?======)/
7012                  * and
7013                  *     /(?(?{xyz}))/
7014                  *     /(?(?=====))/
7015                 */
7016                 assert(pat[s]   == '(');
7017                 assert(pat[s+1] == '?');
7018                 *p++ = '(';
7019                 *p++ = '?';
7020                 s += 2;
7021                 while (s < pRExC_state->code_blocks->cb[n].end) {
7022                     *p++ = '=';
7023                     s++;
7024                 }
7025                 *p++ = ')';
7026                 n++;
7027                 continue;
7028             }
7029             if (pat[s] == '\'' || pat[s] == '\\')
7030                 *p++ = '\\';
7031             *p++ = pat[s];
7032         }
7033         *p++ = '\'';
7034         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7035             *p++ = 'x';
7036             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7037                 *p++ = 'x';
7038             }
7039         }
7040         *p++ = '\0';
7041         DEBUG_COMPILE_r({
7042             Perl_re_printf( aTHX_
7043                 "%sre-parsing pattern for runtime code:%s %s\n",
7044                 PL_colors[4], PL_colors[5], newpat);
7045         });
7046
7047         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7048         Safefree(newpat);
7049
7050         ENTER;
7051         SAVETMPS;
7052         save_re_context();
7053         PUSHSTACKi(PERLSI_REQUIRE);
7054         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7055          * parsing qr''; normally only q'' does this. It also alters
7056          * hints handling */
7057         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7058         SvREFCNT_dec_NN(sv);
7059         SPAGAIN;
7060         qr_ref = POPs;
7061         PUTBACK;
7062         {
7063             SV * const errsv = ERRSV;
7064             if (SvTRUE_NN(errsv))
7065                 /* use croak_sv ? */
7066                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7067         }
7068         assert(SvROK(qr_ref));
7069         qr = SvRV(qr_ref);
7070         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7071         /* the leaving below frees the tmp qr_ref.
7072          * Give qr a life of its own */
7073         SvREFCNT_inc(qr);
7074         POPSTACK;
7075         FREETMPS;
7076         LEAVE;
7077
7078     }
7079
7080     if (!RExC_utf8 && SvUTF8(qr)) {
7081         /* first time through; the pattern got upgraded; save the
7082          * qr for the next time through */
7083         assert(!pRExC_state->runtime_code_qr);
7084         pRExC_state->runtime_code_qr = qr;
7085         return 0;
7086     }
7087
7088
7089     /* extract any code blocks within the returned qr//  */
7090
7091
7092     /* merge the main (r1) and run-time (r2) code blocks into one */
7093     {
7094         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7095         struct reg_code_block *new_block, *dst;
7096         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7097         int i1 = 0, i2 = 0;
7098         int r1c, r2c;
7099
7100         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7101         {
7102             SvREFCNT_dec_NN(qr);
7103             return 1;
7104         }
7105
7106         if (!r1->code_blocks)
7107             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7108
7109         r1c = r1->code_blocks->count;
7110         r2c = r2->code_blocks->count;
7111
7112         Newx(new_block, r1c + r2c, struct reg_code_block);
7113
7114         dst = new_block;
7115
7116         while (i1 < r1c || i2 < r2c) {
7117             struct reg_code_block *src;
7118             bool is_qr = 0;
7119
7120             if (i1 == r1c) {
7121                 src = &r2->code_blocks->cb[i2++];
7122                 is_qr = 1;
7123             }
7124             else if (i2 == r2c)
7125                 src = &r1->code_blocks->cb[i1++];
7126             else if (  r1->code_blocks->cb[i1].start
7127                      < r2->code_blocks->cb[i2].start)
7128             {
7129                 src = &r1->code_blocks->cb[i1++];
7130                 assert(src->end < r2->code_blocks->cb[i2].start);
7131             }
7132             else {
7133                 assert(  r1->code_blocks->cb[i1].start
7134                        > r2->code_blocks->cb[i2].start);
7135                 src = &r2->code_blocks->cb[i2++];
7136                 is_qr = 1;
7137                 assert(src->end < r1->code_blocks->cb[i1].start);
7138             }
7139
7140             assert(pat[src->start] == '(');
7141             assert(pat[src->end]   == ')');
7142             dst->start      = src->start;
7143             dst->end        = src->end;
7144             dst->block      = src->block;
7145             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7146                                     : src->src_regex;
7147             dst++;
7148         }
7149         r1->code_blocks->count += r2c;
7150         Safefree(r1->code_blocks->cb);
7151         r1->code_blocks->cb = new_block;
7152     }
7153
7154     SvREFCNT_dec_NN(qr);
7155     return 1;
7156 }
7157
7158
7159 STATIC bool
7160 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7161                       struct reg_substr_datum  *rsd,
7162                       struct scan_data_substrs *sub,
7163                       STRLEN longest_length)
7164 {
7165     /* This is the common code for setting up the floating and fixed length
7166      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7167      * as to whether succeeded or not */
7168
7169     I32 t;
7170     SSize_t ml;
7171     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7172     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7173
7174     if (! (longest_length
7175            || (eol /* Can't have SEOL and MULTI */
7176                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7177           )
7178             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7179         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7180     {
7181         return FALSE;
7182     }
7183
7184     /* copy the information about the longest from the reg_scan_data
7185         over to the program. */
7186     if (SvUTF8(sub->str)) {
7187         rsd->substr      = NULL;
7188         rsd->utf8_substr = sub->str;
7189     } else {
7190         rsd->substr      = sub->str;
7191         rsd->utf8_substr = NULL;
7192     }
7193     /* end_shift is how many chars that must be matched that
7194         follow this item. We calculate it ahead of time as once the
7195         lookbehind offset is added in we lose the ability to correctly
7196         calculate it.*/
7197     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7198     rsd->end_shift = ml - sub->min_offset
7199         - longest_length
7200             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7201              * intead? - DAPM
7202             + (SvTAIL(sub->str) != 0)
7203             */
7204         + sub->lookbehind;
7205
7206     t = (eol/* Can't have SEOL and MULTI */
7207          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7208     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7209
7210     return TRUE;
7211 }
7212
7213 STATIC void
7214 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7215 {
7216     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7217      * properly wrapped with the right modifiers */
7218
7219     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7220     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7221                                                 != REGEX_DEPENDS_CHARSET);
7222
7223     /* The caret is output if there are any defaults: if not all the STD
7224         * flags are set, or if no character set specifier is needed */
7225     bool has_default =
7226                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7227                 || ! has_charset);
7228     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7229                                                 == REG_RUN_ON_COMMENT_SEEN);
7230     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7231                         >> RXf_PMf_STD_PMMOD_SHIFT);
7232     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7233     char *p;
7234     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7235
7236     /* We output all the necessary flags; we never output a minus, as all
7237         * those are defaults, so are
7238         * covered by the caret */
7239     const STRLEN wraplen = pat_len + has_p + has_runon
7240         + has_default       /* If needs a caret */
7241         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7242
7243             /* If needs a character set specifier */
7244         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7245         + (sizeof("(?:)") - 1);
7246
7247     PERL_ARGS_ASSERT_SET_REGEX_PV;
7248
7249     /* make sure PL_bitcount bounds not exceeded */
7250     assert(sizeof(STD_PAT_MODS) <= 8);
7251
7252     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7253     SvPOK_on(Rx);
7254     if (RExC_utf8)
7255         SvFLAGS(Rx) |= SVf_UTF8;
7256     *p++='('; *p++='?';
7257
7258     /* If a default, cover it using the caret */
7259     if (has_default) {
7260         *p++= DEFAULT_PAT_MOD;
7261     }
7262     if (has_charset) {
7263         STRLEN len;
7264         const char* name;
7265
7266         name = get_regex_charset_name(RExC_rx->extflags, &len);
7267         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7268             assert(RExC_utf8);
7269             name = UNICODE_PAT_MODS;
7270             len = sizeof(UNICODE_PAT_MODS) - 1;
7271         }
7272         Copy(name, p, len, char);
7273         p += len;
7274     }
7275     if (has_p)
7276         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7277     {
7278         char ch;
7279         while((ch = *fptr++)) {
7280             if(reganch & 1)
7281                 *p++ = ch;
7282             reganch >>= 1;
7283         }
7284     }
7285
7286     *p++ = ':';
7287     Copy(RExC_precomp, p, pat_len, char);
7288     assert ((RX_WRAPPED(Rx) - p) < 16);
7289     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7290     p += pat_len;
7291
7292     /* Adding a trailing \n causes this to compile properly:
7293             my $R = qr / A B C # D E/x; /($R)/
7294         Otherwise the parens are considered part of the comment */
7295     if (has_runon)
7296         *p++ = '\n';
7297     *p++ = ')';
7298     *p = 0;
7299     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7300 }
7301
7302 /*
7303  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7304  * regular expression into internal code.
7305  * The pattern may be passed either as:
7306  *    a list of SVs (patternp plus pat_count)
7307  *    a list of OPs (expr)
7308  * If both are passed, the SV list is used, but the OP list indicates
7309  * which SVs are actually pre-compiled code blocks
7310  *
7311  * The SVs in the list have magic and qr overloading applied to them (and
7312  * the list may be modified in-place with replacement SVs in the latter
7313  * case).
7314  *
7315  * If the pattern hasn't changed from old_re, then old_re will be
7316  * returned.
7317  *
7318  * eng is the current engine. If that engine has an op_comp method, then
7319  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7320  * do the initial concatenation of arguments and pass on to the external
7321  * engine.
7322  *
7323  * If is_bare_re is not null, set it to a boolean indicating whether the
7324  * arg list reduced (after overloading) to a single bare regex which has
7325  * been returned (i.e. /$qr/).
7326  *
7327  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7328  *
7329  * pm_flags contains the PMf_* flags, typically based on those from the
7330  * pm_flags field of the related PMOP. Currently we're only interested in
7331  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7332  *
7333  * For many years this code had an initial sizing pass that calculated
7334  * (sometimes incorrectly, leading to security holes) the size needed for the
7335  * compiled pattern.  That was changed by commit
7336  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7337  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7338  * references to this sizing pass.
7339  *
7340  * Now, an initial crude guess as to the size needed is made, based on the
7341  * length of the pattern.  Patches welcome to improve that guess.  That amount
7342  * of space is malloc'd and then immediately freed, and then clawed back node
7343  * by node.  This design is to minimze, to the extent possible, memory churn
7344  * when doing the the reallocs.
7345  *
7346  * A separate parentheses counting pass may be needed in some cases.
7347  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7348  * of these cases.
7349  *
7350  * The existence of a sizing pass necessitated design decisions that are no
7351  * longer needed.  There are potential areas of simplification.
7352  *
7353  * Beware that the optimization-preparation code in here knows about some
7354  * of the structure of the compiled regexp.  [I'll say.]
7355  */
7356
7357 REGEXP *
7358 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7359                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7360                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7361 {
7362     dVAR;
7363     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7364     STRLEN plen;
7365     char *exp;
7366     regnode *scan;
7367     I32 flags;
7368     SSize_t minlen = 0;
7369     U32 rx_flags;
7370     SV *pat;
7371     SV** new_patternp = patternp;
7372
7373     /* these are all flags - maybe they should be turned
7374      * into a single int with different bit masks */
7375     I32 sawlookahead = 0;
7376     I32 sawplus = 0;
7377     I32 sawopen = 0;
7378     I32 sawminmod = 0;
7379
7380     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7381     bool recompile = 0;
7382     bool runtime_code = 0;
7383     scan_data_t data;
7384     RExC_state_t RExC_state;
7385     RExC_state_t * const pRExC_state = &RExC_state;
7386 #ifdef TRIE_STUDY_OPT
7387     int restudied = 0;
7388     RExC_state_t copyRExC_state;
7389 #endif
7390     GET_RE_DEBUG_FLAGS_DECL;
7391
7392     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7393
7394     DEBUG_r(if (!PL_colorset) reginitcolors());
7395
7396     /* Initialize these here instead of as-needed, as is quick and avoids
7397      * having to test them each time otherwise */
7398     if (! PL_InBitmap) {
7399 #ifdef DEBUGGING
7400         char * dump_len_string;
7401 #endif
7402
7403         /* This is calculated here, because the Perl program that generates the
7404          * static global ones doesn't currently have access to
7405          * NUM_ANYOF_CODE_POINTS */
7406         PL_InBitmap = _new_invlist(2);
7407         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7408                                                     NUM_ANYOF_CODE_POINTS - 1);
7409 #ifdef DEBUGGING
7410         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7411         if (   ! dump_len_string
7412             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7413         {
7414             PL_dump_re_max_len = 60;    /* A reasonable default */
7415         }
7416 #endif
7417     }
7418
7419     pRExC_state->warn_text = NULL;
7420     pRExC_state->unlexed_names = NULL;
7421     pRExC_state->code_blocks = NULL;
7422
7423     if (is_bare_re)
7424         *is_bare_re = FALSE;
7425
7426     if (expr && (expr->op_type == OP_LIST ||
7427                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7428         /* allocate code_blocks if needed */
7429         OP *o;
7430         int ncode = 0;
7431
7432         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7433             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7434                 ncode++; /* count of DO blocks */
7435
7436         if (ncode)
7437             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7438     }
7439
7440     if (!pat_count) {
7441         /* compile-time pattern with just OP_CONSTs and DO blocks */
7442
7443         int n;
7444         OP *o;
7445
7446         /* find how many CONSTs there are */
7447         assert(expr);
7448         n = 0;
7449         if (expr->op_type == OP_CONST)
7450             n = 1;
7451         else
7452             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7453                 if (o->op_type == OP_CONST)
7454                     n++;
7455             }
7456
7457         /* fake up an SV array */
7458
7459         assert(!new_patternp);
7460         Newx(new_patternp, n, SV*);
7461         SAVEFREEPV(new_patternp);
7462         pat_count = n;
7463
7464         n = 0;
7465         if (expr->op_type == OP_CONST)
7466             new_patternp[n] = cSVOPx_sv(expr);
7467         else
7468             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7469                 if (o->op_type == OP_CONST)
7470                     new_patternp[n++] = cSVOPo_sv;
7471             }
7472
7473     }
7474
7475     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7476         "Assembling pattern from %d elements%s\n", pat_count,
7477             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7478
7479     /* set expr to the first arg op */
7480
7481     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7482          && expr->op_type != OP_CONST)
7483     {
7484             expr = cLISTOPx(expr)->op_first;
7485             assert(   expr->op_type == OP_PUSHMARK
7486                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7487                    || expr->op_type == OP_PADRANGE);
7488             expr = OpSIBLING(expr);
7489     }
7490
7491     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7492                         expr, &recompile, NULL);
7493
7494     /* handle bare (possibly after overloading) regex: foo =~ $re */
7495     {
7496         SV *re = pat;
7497         if (SvROK(re))
7498             re = SvRV(re);
7499         if (SvTYPE(re) == SVt_REGEXP) {
7500             if (is_bare_re)
7501                 *is_bare_re = TRUE;
7502             SvREFCNT_inc(re);
7503             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7504                 "Precompiled pattern%s\n",
7505                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7506
7507             return (REGEXP*)re;
7508         }
7509     }
7510
7511     exp = SvPV_nomg(pat, plen);
7512
7513     if (!eng->op_comp) {
7514         if ((SvUTF8(pat) && IN_BYTES)
7515                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7516         {
7517             /* make a temporary copy; either to convert to bytes,
7518              * or to avoid repeating get-magic / overloaded stringify */
7519             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7520                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7521         }
7522         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7523     }
7524
7525     /* ignore the utf8ness if the pattern is 0 length */
7526     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7527     RExC_uni_semantics = 0;
7528     RExC_contains_locale = 0;
7529     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7530     RExC_in_script_run = 0;
7531     RExC_study_started = 0;
7532     pRExC_state->runtime_code_qr = NULL;
7533     RExC_frame_head= NULL;
7534     RExC_frame_last= NULL;
7535     RExC_frame_count= 0;
7536     RExC_latest_warn_offset = 0;
7537     RExC_use_BRANCHJ = 0;
7538     RExC_total_parens = 0;
7539     RExC_open_parens = NULL;
7540     RExC_close_parens = NULL;
7541     RExC_paren_names = NULL;
7542     RExC_size = 0;
7543     RExC_seen_d_op = FALSE;
7544 #ifdef DEBUGGING
7545     RExC_paren_name_list = NULL;
7546 #endif
7547
7548     DEBUG_r({
7549         RExC_mysv1= sv_newmortal();
7550         RExC_mysv2= sv_newmortal();
7551     });
7552
7553     DEBUG_COMPILE_r({
7554             SV *dsv= sv_newmortal();
7555             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7556             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7557                           PL_colors[4], PL_colors[5], s);
7558         });
7559
7560     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7561      * to utf8 */
7562
7563     if ((pm_flags & PMf_USE_RE_EVAL)
7564                 /* this second condition covers the non-regex literal case,
7565                  * i.e.  $foo =~ '(?{})'. */
7566                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7567     )
7568         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7569
7570   redo_parse:
7571     /* return old regex if pattern hasn't changed */
7572     /* XXX: note in the below we have to check the flags as well as the
7573      * pattern.
7574      *
7575      * Things get a touch tricky as we have to compare the utf8 flag
7576      * independently from the compile flags.  */
7577
7578     if (   old_re
7579         && !recompile
7580         && !!RX_UTF8(old_re) == !!RExC_utf8
7581         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7582         && RX_PRECOMP(old_re)
7583         && RX_PRELEN(old_re) == plen
7584         && memEQ(RX_PRECOMP(old_re), exp, plen)
7585         && !runtime_code /* with runtime code, always recompile */ )
7586     {
7587         return old_re;
7588     }
7589
7590     /* Allocate the pattern's SV */
7591     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7592     RExC_rx = ReANY(Rx);
7593     if ( RExC_rx == NULL )
7594         FAIL("Regexp out of space");
7595
7596     rx_flags = orig_rx_flags;
7597
7598     if (   (UTF || RExC_uni_semantics)
7599         && initial_charset == REGEX_DEPENDS_CHARSET)
7600     {
7601
7602         /* Set to use unicode semantics if the pattern is in utf8 and has the
7603          * 'depends' charset specified, as it means unicode when utf8  */
7604         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7605         RExC_uni_semantics = 1;
7606     }
7607
7608     RExC_pm_flags = pm_flags;
7609
7610     if (runtime_code) {
7611         assert(TAINTING_get || !TAINT_get);
7612         if (TAINT_get)
7613             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7614
7615         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7616             /* whoops, we have a non-utf8 pattern, whilst run-time code
7617              * got compiled as utf8. Try again with a utf8 pattern */
7618             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7619                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7620             goto redo_parse;
7621         }
7622     }
7623     assert(!pRExC_state->runtime_code_qr);
7624
7625     RExC_sawback = 0;
7626
7627     RExC_seen = 0;
7628     RExC_maxlen = 0;
7629     RExC_in_lookbehind = 0;
7630     RExC_in_lookahead = 0;
7631     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7632     RExC_recode_x_to_native = 0;
7633     RExC_in_multi_char_class = 0;
7634
7635     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7636     RExC_precomp_end = RExC_end = exp + plen;
7637     RExC_nestroot = 0;
7638     RExC_whilem_seen = 0;
7639     RExC_end_op = NULL;
7640     RExC_recurse = NULL;
7641     RExC_study_chunk_recursed = NULL;
7642     RExC_study_chunk_recursed_bytes= 0;
7643     RExC_recurse_count = 0;
7644     pRExC_state->code_index = 0;
7645
7646     /* Initialize the string in the compiled pattern.  This is so that there is
7647      * something to output if necessary */
7648     set_regex_pv(pRExC_state, Rx);
7649
7650     DEBUG_PARSE_r({
7651         Perl_re_printf( aTHX_
7652             "Starting parse and generation\n");
7653         RExC_lastnum=0;
7654         RExC_lastparse=NULL;
7655     });
7656
7657     /* Allocate space and zero-initialize. Note, the two step process
7658        of zeroing when in debug mode, thus anything assigned has to
7659        happen after that */
7660     if (!  RExC_size) {
7661
7662         /* On the first pass of the parse, we guess how big this will be.  Then
7663          * we grow in one operation to that amount and then give it back.  As
7664          * we go along, we re-allocate what we need.
7665          *
7666          * XXX Currently the guess is essentially that the pattern will be an
7667          * EXACT node with one byte input, one byte output.  This is crude, and
7668          * better heuristics are welcome.
7669          *
7670          * On any subsequent passes, we guess what we actually computed in the
7671          * latest earlier pass.  Such a pass probably didn't complete so is
7672          * missing stuff.  We could improve those guesses by knowing where the
7673          * parse stopped, and use the length so far plus apply the above
7674          * assumption to what's left. */
7675         RExC_size = STR_SZ(RExC_end - RExC_start);
7676     }
7677
7678     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7679     if ( RExC_rxi == NULL )
7680         FAIL("Regexp out of space");
7681
7682     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7683     RXi_SET( RExC_rx, RExC_rxi );
7684
7685     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7686      * node parsed will give back any excess memory we have allocated so far).
7687      * */
7688     RExC_size = 0;
7689
7690     /* non-zero initialization begins here */
7691     RExC_rx->engine= eng;
7692     RExC_rx->extflags = rx_flags;
7693     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7694
7695     if (pm_flags & PMf_IS_QR) {
7696         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7697         if (RExC_rxi->code_blocks) {
7698             RExC_rxi->code_blocks->refcnt++;
7699         }
7700     }
7701
7702     RExC_rx->intflags = 0;
7703
7704     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7705     RExC_parse = exp;
7706
7707     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7708      * code makes sure the final byte is an uncounted NUL.  But should this
7709      * ever not be the case, lots of things could read beyond the end of the
7710      * buffer: loops like
7711      *      while(isFOO(*RExC_parse)) RExC_parse++;
7712      *      strchr(RExC_parse, "foo");
7713      * etc.  So it is worth noting. */
7714     assert(*RExC_end == '\0');
7715
7716     RExC_naughty = 0;
7717     RExC_npar = 1;
7718     RExC_parens_buf_size = 0;
7719     RExC_emit_start = RExC_rxi->program;
7720     pRExC_state->code_index = 0;
7721
7722     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7723     RExC_emit = 1;
7724
7725     /* Do the parse */
7726     if (reg(pRExC_state, 0, &flags, 1)) {
7727
7728         /* Success!, But we may need to redo the parse knowing how many parens
7729          * there actually are */
7730         if (IN_PARENS_PASS) {
7731             flags |= RESTART_PARSE;
7732         }
7733
7734         /* We have that number in RExC_npar */
7735         RExC_total_parens = RExC_npar;
7736     }
7737     else if (! MUST_RESTART(flags)) {
7738         ReREFCNT_dec(Rx);
7739         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7740     }
7741
7742     /* Here, we either have success, or we have to redo the parse for some reason */
7743     if (MUST_RESTART(flags)) {
7744
7745         /* It's possible to write a regexp in ascii that represents Unicode
7746         codepoints outside of the byte range, such as via \x{100}. If we
7747         detect such a sequence we have to convert the entire pattern to utf8
7748         and then recompile, as our sizing calculation will have been based
7749         on 1 byte == 1 character, but we will need to use utf8 to encode
7750         at least some part of the pattern, and therefore must convert the whole
7751         thing.
7752         -- dmq */
7753         if (flags & NEED_UTF8) {
7754
7755             /* We have stored the offset of the final warning output so far.
7756              * That must be adjusted.  Any variant characters between the start
7757              * of the pattern and this warning count for 2 bytes in the final,
7758              * so just add them again */
7759             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7760                 RExC_latest_warn_offset +=
7761                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7762                                                 + RExC_latest_warn_offset);
7763             }
7764             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7765             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7766             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7767         }
7768         else {
7769             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7770         }
7771
7772         if (ALL_PARENS_COUNTED) {
7773             /* Make enough room for all the known parens, and zero it */
7774             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7775             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7776             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7777
7778             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7779             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7780         }
7781         else { /* Parse did not complete.  Reinitialize the parentheses
7782                   structures */
7783             RExC_total_parens = 0;
7784             if (RExC_open_parens) {
7785                 Safefree(RExC_open_parens);
7786                 RExC_open_parens = NULL;
7787             }
7788             if (RExC_close_parens) {
7789                 Safefree(RExC_close_parens);
7790                 RExC_close_parens = NULL;
7791             }
7792         }
7793
7794         /* Clean up what we did in this parse */
7795         SvREFCNT_dec_NN(RExC_rx_sv);
7796
7797         goto redo_parse;
7798     }
7799
7800     /* Here, we have successfully parsed and generated the pattern's program
7801      * for the regex engine.  We are ready to finish things up and look for
7802      * optimizations. */
7803
7804     /* Update the string to compile, with correct modifiers, etc */
7805     set_regex_pv(pRExC_state, Rx);
7806
7807     RExC_rx->nparens = RExC_total_parens - 1;
7808
7809     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7810     if (RExC_whilem_seen > 15)
7811         RExC_whilem_seen = 15;
7812
7813     DEBUG_PARSE_r({
7814         Perl_re_printf( aTHX_
7815             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7816         RExC_lastnum=0;
7817         RExC_lastparse=NULL;
7818     });
7819
7820 #ifdef RE_TRACK_PATTERN_OFFSETS
7821     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7822                           "%s %" UVuf " bytes for offset annotations.\n",
7823                           RExC_offsets ? "Got" : "Couldn't get",
7824                           (UV)((RExC_offsets[0] * 2 + 1))));
7825     DEBUG_OFFSETS_r(if (RExC_offsets) {
7826         const STRLEN len = RExC_offsets[0];
7827         STRLEN i;
7828         GET_RE_DEBUG_FLAGS_DECL;
7829         Perl_re_printf( aTHX_
7830                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7831         for (i = 1; i <= len; i++) {
7832             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7833                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7834                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7835         }
7836         Perl_re_printf( aTHX_  "\n");
7837     });
7838
7839 #else
7840     SetProgLen(RExC_rxi,RExC_size);
7841 #endif
7842
7843     DEBUG_DUMP_PRE_OPTIMIZE_r({
7844         SV * const sv = sv_newmortal();
7845         RXi_GET_DECL(RExC_rx, ri);
7846         DEBUG_RExC_seen();
7847         Perl_re_printf( aTHX_ "Program before optimization:\n");
7848
7849         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7850                         sv, 0, 0);
7851     });
7852
7853     DEBUG_OPTIMISE_r(
7854         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7855     );
7856
7857     /* XXXX To minimize changes to RE engine we always allocate
7858        3-units-long substrs field. */
7859     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7860     if (RExC_recurse_count) {
7861         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7862         SAVEFREEPV(RExC_recurse);
7863     }
7864
7865     if (RExC_seen & REG_RECURSE_SEEN) {
7866         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7867          * So its 1 if there are no parens. */
7868         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7869                                          ((RExC_total_parens & 0x07) != 0);
7870         Newx(RExC_study_chunk_recursed,
7871              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7872         SAVEFREEPV(RExC_study_chunk_recursed);
7873     }
7874
7875   reStudy:
7876     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7877     DEBUG_r(
7878         RExC_study_chunk_recursed_count= 0;
7879     );
7880     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7881     if (RExC_study_chunk_recursed) {
7882         Zero(RExC_study_chunk_recursed,
7883              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7884     }
7885
7886
7887 #ifdef TRIE_STUDY_OPT
7888     if (!restudied) {
7889         StructCopy(&zero_scan_data, &data, scan_data_t);
7890         copyRExC_state = RExC_state;
7891     } else {
7892         U32 seen=RExC_seen;
7893         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7894
7895         RExC_state = copyRExC_state;
7896         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7897             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7898         else
7899             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7900         StructCopy(&zero_scan_data, &data, scan_data_t);
7901     }
7902 #else
7903     StructCopy(&zero_scan_data, &data, scan_data_t);
7904 #endif
7905
7906     /* Dig out information for optimizations. */
7907     RExC_rx->extflags = RExC_flags; /* was pm_op */
7908     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7909
7910     if (UTF)
7911         SvUTF8_on(Rx);  /* Unicode in it? */
7912     RExC_rxi->regstclass = NULL;
7913     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7914         RExC_rx->intflags |= PREGf_NAUGHTY;
7915     scan = RExC_rxi->program + 1;               /* First BRANCH. */
7916
7917     /* testing for BRANCH here tells us whether there is "must appear"
7918        data in the pattern. If there is then we can use it for optimisations */
7919     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7920                                                   */
7921         SSize_t fake;
7922         STRLEN longest_length[2];
7923         regnode_ssc ch_class; /* pointed to by data */
7924         int stclass_flag;
7925         SSize_t last_close = 0; /* pointed to by data */
7926         regnode *first= scan;
7927         regnode *first_next= regnext(first);
7928         int i;
7929
7930         /*
7931          * Skip introductions and multiplicators >= 1
7932          * so that we can extract the 'meat' of the pattern that must
7933          * match in the large if() sequence following.
7934          * NOTE that EXACT is NOT covered here, as it is normally
7935          * picked up by the optimiser separately.
7936          *
7937          * This is unfortunate as the optimiser isnt handling lookahead
7938          * properly currently.
7939          *
7940          */
7941         while ((OP(first) == OPEN && (sawopen = 1)) ||
7942                /* An OR of *one* alternative - should not happen now. */
7943             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7944             /* for now we can't handle lookbehind IFMATCH*/
7945             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7946             (OP(first) == PLUS) ||
7947             (OP(first) == MINMOD) ||
7948                /* An {n,m} with n>0 */
7949             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7950             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7951         {
7952                 /*
7953                  * the only op that could be a regnode is PLUS, all the rest
7954                  * will be regnode_1 or regnode_2.
7955                  *
7956                  * (yves doesn't think this is true)
7957                  */
7958                 if (OP(first) == PLUS)
7959                     sawplus = 1;
7960                 else {
7961                     if (OP(first) == MINMOD)
7962                         sawminmod = 1;
7963                     first += regarglen[OP(first)];
7964                 }
7965                 first = NEXTOPER(first);
7966                 first_next= regnext(first);
7967         }
7968
7969         /* Starting-point info. */
7970       again:
7971         DEBUG_PEEP("first:", first, 0, 0);
7972         /* Ignore EXACT as we deal with it later. */
7973         if (PL_regkind[OP(first)] == EXACT) {
7974             if (   OP(first) == EXACT
7975                 || OP(first) == EXACT_ONLY8
7976                 || OP(first) == EXACTL)
7977             {
7978                 NOOP;   /* Empty, get anchored substr later. */
7979             }
7980             else
7981                 RExC_rxi->regstclass = first;
7982         }
7983 #ifdef TRIE_STCLASS
7984         else if (PL_regkind[OP(first)] == TRIE &&
7985                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7986         {
7987             /* this can happen only on restudy */
7988             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7989         }
7990 #endif
7991         else if (REGNODE_SIMPLE(OP(first)))
7992             RExC_rxi->regstclass = first;
7993         else if (PL_regkind[OP(first)] == BOUND ||
7994                  PL_regkind[OP(first)] == NBOUND)
7995             RExC_rxi->regstclass = first;
7996         else if (PL_regkind[OP(first)] == BOL) {
7997             RExC_rx->intflags |= (OP(first) == MBOL
7998                            ? PREGf_ANCH_MBOL
7999                            : PREGf_ANCH_SBOL);
8000             first = NEXTOPER(first);
8001             goto again;
8002         }
8003         else if (OP(first) == GPOS) {
8004             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8005             first = NEXTOPER(first);
8006             goto again;
8007         }
8008         else if ((!sawopen || !RExC_sawback) &&
8009             !sawlookahead &&
8010             (OP(first) == STAR &&
8011             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8012             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8013         {
8014             /* turn .* into ^.* with an implied $*=1 */
8015             const int type =
8016                 (OP(NEXTOPER(first)) == REG_ANY)
8017                     ? PREGf_ANCH_MBOL
8018                     : PREGf_ANCH_SBOL;
8019             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8020             first = NEXTOPER(first);
8021             goto again;
8022         }
8023         if (sawplus && !sawminmod && !sawlookahead
8024             && (!sawopen || !RExC_sawback)
8025             && !pRExC_state->code_blocks) /* May examine pos and $& */
8026             /* x+ must match at the 1st pos of run of x's */
8027             RExC_rx->intflags |= PREGf_SKIP;
8028
8029         /* Scan is after the zeroth branch, first is atomic matcher. */
8030 #ifdef TRIE_STUDY_OPT
8031         DEBUG_PARSE_r(
8032             if (!restudied)
8033                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8034                               (IV)(first - scan + 1))
8035         );
8036 #else
8037         DEBUG_PARSE_r(
8038             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8039                 (IV)(first - scan + 1))
8040         );
8041 #endif
8042
8043
8044         /*
8045         * If there's something expensive in the r.e., find the
8046         * longest literal string that must appear and make it the
8047         * regmust.  Resolve ties in favor of later strings, since
8048         * the regstart check works with the beginning of the r.e.
8049         * and avoiding duplication strengthens checking.  Not a
8050         * strong reason, but sufficient in the absence of others.
8051         * [Now we resolve ties in favor of the earlier string if
8052         * it happens that c_offset_min has been invalidated, since the
8053         * earlier string may buy us something the later one won't.]
8054         */
8055
8056         data.substrs[0].str = newSVpvs("");
8057         data.substrs[1].str = newSVpvs("");
8058         data.last_found = newSVpvs("");
8059         data.cur_is_floating = 0; /* initially any found substring is fixed */
8060         ENTER_with_name("study_chunk");
8061         SAVEFREESV(data.substrs[0].str);
8062         SAVEFREESV(data.substrs[1].str);
8063         SAVEFREESV(data.last_found);
8064         first = scan;
8065         if (!RExC_rxi->regstclass) {
8066             ssc_init(pRExC_state, &ch_class);
8067             data.start_class = &ch_class;
8068             stclass_flag = SCF_DO_STCLASS_AND;
8069         } else                          /* XXXX Check for BOUND? */
8070             stclass_flag = 0;
8071         data.last_closep = &last_close;
8072
8073         DEBUG_RExC_seen();
8074         /*
8075          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8076          * (NO top level branches)
8077          */
8078         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8079                              scan + RExC_size, /* Up to end */
8080             &data, -1, 0, NULL,
8081             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8082                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8083             0);
8084
8085
8086         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8087
8088
8089         if ( RExC_total_parens == 1 && !data.cur_is_floating
8090              && data.last_start_min == 0 && data.last_end > 0
8091              && !RExC_seen_zerolen
8092              && !(RExC_seen & REG_VERBARG_SEEN)
8093              && !(RExC_seen & REG_GPOS_SEEN)
8094         ){
8095             RExC_rx->extflags |= RXf_CHECK_ALL;
8096         }
8097         scan_commit(pRExC_state, &data,&minlen, 0);
8098
8099
8100         /* XXX this is done in reverse order because that's the way the
8101          * code was before it was parameterised. Don't know whether it
8102          * actually needs doing in reverse order. DAPM */
8103         for (i = 1; i >= 0; i--) {
8104             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8105
8106             if (   !(   i
8107                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8108                      &&    data.substrs[0].min_offset
8109                         == data.substrs[1].min_offset
8110                      &&    SvCUR(data.substrs[0].str)
8111                         == SvCUR(data.substrs[1].str)
8112                     )
8113                 && S_setup_longest (aTHX_ pRExC_state,
8114                                         &(RExC_rx->substrs->data[i]),
8115                                         &(data.substrs[i]),
8116                                         longest_length[i]))
8117             {
8118                 RExC_rx->substrs->data[i].min_offset =
8119                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8120
8121                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8122                 /* Don't offset infinity */
8123                 if (data.substrs[i].max_offset < SSize_t_MAX)
8124                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8125                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8126             }
8127             else {
8128                 RExC_rx->substrs->data[i].substr      = NULL;
8129                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8130                 longest_length[i] = 0;
8131             }
8132         }
8133
8134         LEAVE_with_name("study_chunk");
8135
8136         if (RExC_rxi->regstclass
8137             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8138             RExC_rxi->regstclass = NULL;
8139
8140         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8141               || RExC_rx->substrs->data[0].min_offset)
8142             && stclass_flag
8143             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8144             && is_ssc_worth_it(pRExC_state, data.start_class))
8145         {
8146             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8147
8148             ssc_finalize(pRExC_state, data.start_class);
8149
8150             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8151             StructCopy(data.start_class,
8152                        (regnode_ssc*)RExC_rxi->data->data[n],
8153                        regnode_ssc);
8154             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8155             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8156             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8157                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8158                       Perl_re_printf( aTHX_
8159                                     "synthetic stclass \"%s\".\n",
8160                                     SvPVX_const(sv));});
8161             data.start_class = NULL;
8162         }
8163
8164         /* A temporary algorithm prefers floated substr to fixed one of
8165          * same length to dig more info. */
8166         i = (longest_length[0] <= longest_length[1]);
8167         RExC_rx->substrs->check_ix = i;
8168         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8169         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8170         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8171         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8172         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8173         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8174             RExC_rx->intflags |= PREGf_NOSCAN;
8175
8176         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8177             RExC_rx->extflags |= RXf_USE_INTUIT;
8178             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8179                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8180         }
8181
8182         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8183         if ( (STRLEN)minlen < longest_length[1] )
8184             minlen= longest_length[1];
8185         if ( (STRLEN)minlen < longest_length[0] )
8186             minlen= longest_length[0];
8187         */
8188     }
8189     else {
8190         /* Several toplevels. Best we can is to set minlen. */
8191         SSize_t fake;
8192         regnode_ssc ch_class;
8193         SSize_t last_close = 0;
8194
8195         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8196
8197         scan = RExC_rxi->program + 1;
8198         ssc_init(pRExC_state, &ch_class);
8199         data.start_class = &ch_class;
8200         data.last_closep = &last_close;
8201
8202         DEBUG_RExC_seen();
8203         /*
8204          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8205          * (patterns WITH top level branches)
8206          */
8207         minlen = study_chunk(pRExC_state,
8208             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8209             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8210                                                       ? SCF_TRIE_DOING_RESTUDY
8211                                                       : 0),
8212             0);
8213
8214         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8215
8216         RExC_rx->check_substr = NULL;
8217         RExC_rx->check_utf8 = NULL;
8218         RExC_rx->substrs->data[0].substr      = NULL;
8219         RExC_rx->substrs->data[0].utf8_substr = NULL;
8220         RExC_rx->substrs->data[1].substr      = NULL;
8221         RExC_rx->substrs->data[1].utf8_substr = NULL;
8222
8223         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8224             && is_ssc_worth_it(pRExC_state, data.start_class))
8225         {
8226             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8227
8228             ssc_finalize(pRExC_state, data.start_class);
8229
8230             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8231             StructCopy(data.start_class,
8232                        (regnode_ssc*)RExC_rxi->data->data[n],
8233                        regnode_ssc);
8234             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8235             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8236             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8237                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8238                       Perl_re_printf( aTHX_
8239                                     "synthetic stclass \"%s\".\n",
8240                                     SvPVX_const(sv));});
8241             data.start_class = NULL;
8242         }
8243     }
8244
8245     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8246         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8247         RExC_rx->maxlen = REG_INFTY;
8248     }
8249     else {
8250         RExC_rx->maxlen = RExC_maxlen;
8251     }
8252
8253     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8254        the "real" pattern. */
8255     DEBUG_OPTIMISE_r({
8256         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8257                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8258     });
8259     RExC_rx->minlenret = minlen;
8260     if (RExC_rx->minlen < minlen)
8261         RExC_rx->minlen = minlen;
8262
8263     if (RExC_seen & REG_RECURSE_SEEN ) {
8264         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8265         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8266     }
8267     if (RExC_seen & REG_GPOS_SEEN)
8268         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8269     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8270         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8271                                                 lookbehind */
8272     if (pRExC_state->code_blocks)
8273         RExC_rx->extflags |= RXf_EVAL_SEEN;
8274     if (RExC_seen & REG_VERBARG_SEEN)
8275     {
8276         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8277         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8278     }
8279     if (RExC_seen & REG_CUTGROUP_SEEN)
8280         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8281     if (pm_flags & PMf_USE_RE_EVAL)
8282         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8283     if (RExC_paren_names)
8284         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8285     else
8286         RXp_PAREN_NAMES(RExC_rx) = NULL;
8287
8288     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8289      * so it can be used in pp.c */
8290     if (RExC_rx->intflags & PREGf_ANCH)
8291         RExC_rx->extflags |= RXf_IS_ANCHORED;
8292
8293
8294     {
8295         /* this is used to identify "special" patterns that might result
8296          * in Perl NOT calling the regex engine and instead doing the match "itself",
8297          * particularly special cases in split//. By having the regex compiler
8298          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8299          * we avoid weird issues with equivalent patterns resulting in different behavior,
8300          * AND we allow non Perl engines to get the same optimizations by the setting the
8301          * flags appropriately - Yves */
8302         regnode *first = RExC_rxi->program + 1;
8303         U8 fop = OP(first);
8304         regnode *next = regnext(first);
8305         U8 nop = OP(next);
8306
8307         if (PL_regkind[fop] == NOTHING && nop == END)
8308             RExC_rx->extflags |= RXf_NULL;
8309         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8310             /* when fop is SBOL first->flags will be true only when it was
8311              * produced by parsing /\A/, and not when parsing /^/. This is
8312              * very important for the split code as there we want to
8313              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8314              * See rt #122761 for more details. -- Yves */
8315             RExC_rx->extflags |= RXf_START_ONLY;
8316         else if (fop == PLUS
8317                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8318                  && nop == END)
8319             RExC_rx->extflags |= RXf_WHITE;
8320         else if ( RExC_rx->extflags & RXf_SPLIT
8321                   && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8322                   && STR_LEN(first) == 1
8323                   && *(STRING(first)) == ' '
8324                   && nop == END )
8325             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8326
8327     }
8328
8329     if (RExC_contains_locale) {
8330         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8331     }
8332
8333 #ifdef DEBUGGING
8334     if (RExC_paren_names) {
8335         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8336         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8337                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8338     } else
8339 #endif
8340     RExC_rxi->name_list_idx = 0;
8341
8342     while ( RExC_recurse_count > 0 ) {
8343         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8344         /*
8345          * This data structure is set up in study_chunk() and is used
8346          * to calculate the distance between a GOSUB regopcode and
8347          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8348          * it refers to.
8349          *
8350          * If for some reason someone writes code that optimises
8351          * away a GOSUB opcode then the assert should be changed to
8352          * an if(scan) to guard the ARG2L_SET() - Yves
8353          *
8354          */
8355         assert(scan && OP(scan) == GOSUB);
8356         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8357     }
8358
8359     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8360     /* assume we don't need to swap parens around before we match */
8361     DEBUG_TEST_r({
8362         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8363             (unsigned long)RExC_study_chunk_recursed_count);
8364     });
8365     DEBUG_DUMP_r({
8366         DEBUG_RExC_seen();
8367         Perl_re_printf( aTHX_ "Final program:\n");
8368         regdump(RExC_rx);
8369     });
8370
8371     if (RExC_open_parens) {
8372         Safefree(RExC_open_parens);
8373         RExC_open_parens = NULL;
8374     }
8375     if (RExC_close_parens) {
8376         Safefree(RExC_close_parens);
8377         RExC_close_parens = NULL;
8378     }
8379
8380 #ifdef USE_ITHREADS
8381     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8382      * by setting the regexp SV to readonly-only instead. If the
8383      * pattern's been recompiled, the USEDness should remain. */
8384     if (old_re && SvREADONLY(old_re))
8385         SvREADONLY_on(Rx);
8386 #endif
8387     return Rx;
8388 }
8389
8390
8391 SV*
8392 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8393                     const U32 flags)
8394 {
8395     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8396
8397     PERL_UNUSED_ARG(value);
8398
8399     if (flags & RXapif_FETCH) {
8400         return reg_named_buff_fetch(rx, key, flags);
8401     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8402         Perl_croak_no_modify();
8403         return NULL;
8404     } else if (flags & RXapif_EXISTS) {
8405         return reg_named_buff_exists(rx, key, flags)
8406             ? &PL_sv_yes
8407             : &PL_sv_no;
8408     } else if (flags & RXapif_REGNAMES) {
8409         return reg_named_buff_all(rx, flags);
8410     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8411         return reg_named_buff_scalar(rx, flags);
8412     } else {
8413         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8414         return NULL;
8415     }
8416 }
8417
8418 SV*
8419 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8420                          const U32 flags)
8421 {
8422     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8423     PERL_UNUSED_ARG(lastkey);
8424
8425     if (flags & RXapif_FIRSTKEY)
8426         return reg_named_buff_firstkey(rx, flags);
8427     else if (flags & RXapif_NEXTKEY)
8428         return reg_named_buff_nextkey(rx, flags);
8429     else {
8430         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8431                                             (int)flags);
8432         return NULL;
8433     }
8434 }
8435
8436 SV*
8437 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8438                           const U32 flags)
8439 {
8440     SV *ret;
8441     struct regexp *const rx = ReANY(r);
8442
8443     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8444
8445     if (rx && RXp_PAREN_NAMES(rx)) {
8446         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8447         if (he_str) {
8448             IV i;
8449             SV* sv_dat=HeVAL(he_str);
8450             I32 *nums=(I32*)SvPVX(sv_dat);
8451             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8452             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8453                 if ((I32)(rx->nparens) >= nums[i]
8454                     && rx->offs[nums[i]].start != -1
8455                     && rx->offs[nums[i]].end != -1)
8456                 {
8457                     ret = newSVpvs("");
8458                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8459                     if (!retarray)
8460                         return ret;
8461                 } else {
8462                     if (retarray)
8463                         ret = newSVsv(&PL_sv_undef);
8464                 }
8465                 if (retarray)
8466                     av_push(retarray, ret);
8467             }
8468             if (retarray)
8469                 return newRV_noinc(MUTABLE_SV(retarray));
8470         }
8471     }
8472     return NULL;
8473 }
8474
8475 bool
8476 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8477                            const U32 flags)
8478 {
8479     struct regexp *const rx = ReANY(r);
8480
8481     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8482
8483     if (rx && RXp_PAREN_NAMES(rx)) {
8484         if (flags & RXapif_ALL) {
8485             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8486         } else {
8487             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8488             if (sv) {
8489                 SvREFCNT_dec_NN(sv);
8490                 return TRUE;
8491             } else {
8492                 return FALSE;
8493             }
8494         }
8495     } else {
8496         return FALSE;
8497     }
8498 }
8499
8500 SV*
8501 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8502 {
8503     struct regexp *const rx = ReANY(r);
8504
8505     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8506
8507     if ( rx && RXp_PAREN_NAMES(rx) ) {
8508         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8509
8510         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8511     } else {
8512         return FALSE;
8513     }
8514 }
8515
8516 SV*
8517 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8518 {
8519     struct regexp *const rx = ReANY(r);
8520     GET_RE_DEBUG_FLAGS_DECL;
8521
8522     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8523
8524     if (rx && RXp_PAREN_NAMES(rx)) {
8525         HV *hv = RXp_PAREN_NAMES(rx);
8526         HE *temphe;
8527         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8528             IV i;
8529             IV parno = 0;
8530             SV* sv_dat = HeVAL(temphe);
8531             I32 *nums = (I32*)SvPVX(sv_dat);
8532             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8533                 if ((I32)(rx->lastparen) >= nums[i] &&
8534                     rx->offs[nums[i]].start != -1 &&
8535                     rx->offs[nums[i]].end != -1)
8536                 {
8537                     parno = nums[i];
8538                     break;
8539                 }
8540             }
8541             if (parno || flags & RXapif_ALL) {
8542                 return newSVhek(HeKEY_hek(temphe));
8543             }
8544         }
8545     }
8546     return NULL;
8547 }
8548
8549 SV*
8550 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8551 {
8552     SV *ret;
8553     AV *av;
8554     SSize_t length;
8555     struct regexp *const rx = ReANY(r);
8556
8557     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8558
8559     if (rx && RXp_PAREN_NAMES(rx)) {
8560         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8561             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8562         } else if (flags & RXapif_ONE) {
8563             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8564             av = MUTABLE_AV(SvRV(ret));
8565             length = av_tindex(av);
8566             SvREFCNT_dec_NN(ret);
8567             return newSViv(length + 1);
8568         } else {
8569             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8570                                                 (int)flags);
8571             return NULL;
8572         }
8573     }
8574     return &PL_sv_undef;
8575 }
8576
8577 SV*
8578 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8579 {
8580     struct regexp *const rx = ReANY(r);
8581     AV *av = newAV();
8582
8583     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8584
8585     if (rx && RXp_PAREN_NAMES(rx)) {
8586         HV *hv= RXp_PAREN_NAMES(rx);
8587         HE *temphe;
8588         (void)hv_iterinit(hv);
8589         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8590             IV i;
8591             IV parno = 0;
8592             SV* sv_dat = HeVAL(temphe);
8593             I32 *nums = (I32*)SvPVX(sv_dat);
8594             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8595                 if ((I32)(rx->lastparen) >= nums[i] &&
8596                     rx->offs[nums[i]].start != -1 &&
8597                     rx->offs[nums[i]].end != -1)
8598                 {
8599                     parno = nums[i];
8600                     break;
8601                 }
8602             }
8603             if (parno || flags & RXapif_ALL) {
8604                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8605             }
8606         }
8607     }
8608
8609     return newRV_noinc(MUTABLE_SV(av));
8610 }
8611
8612 void
8613 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8614                              SV * const sv)
8615 {
8616     struct regexp *const rx = ReANY(r);
8617     char *s = NULL;
8618     SSize_t i = 0;
8619     SSize_t s1, t1;
8620     I32 n = paren;
8621
8622     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8623
8624     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8625            || n == RX_BUFF_IDX_CARET_FULLMATCH
8626            || n == RX_BUFF_IDX_CARET_POSTMATCH
8627        )
8628     {
8629         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8630         if (!keepcopy) {
8631             /* on something like
8632              *    $r = qr/.../;
8633              *    /$qr/p;
8634              * the KEEPCOPY is set on the PMOP rather than the regex */
8635             if (PL_curpm && r == PM_GETRE(PL_curpm))
8636                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8637         }
8638         if (!keepcopy)
8639             goto ret_undef;
8640     }
8641
8642     if (!rx->subbeg)
8643         goto ret_undef;
8644
8645     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8646         /* no need to distinguish between them any more */
8647         n = RX_BUFF_IDX_FULLMATCH;
8648
8649     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8650         && rx->offs[0].start != -1)
8651     {
8652         /* $`, ${^PREMATCH} */
8653         i = rx->offs[0].start;
8654         s = rx->subbeg;
8655     }
8656     else
8657     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8658         && rx->offs[0].end != -1)
8659     {
8660         /* $', ${^POSTMATCH} */
8661         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8662         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8663     }
8664     else
8665     if ( 0 <= n && n <= (I32)rx->nparens &&
8666         (s1 = rx->offs[n].start) != -1 &&
8667         (t1 = rx->offs[n].end) != -1)
8668     {
8669         /* $&, ${^MATCH},  $1 ... */
8670         i = t1 - s1;
8671         s = rx->subbeg + s1 - rx->suboffset;
8672     } else {
8673         goto ret_undef;
8674     }
8675
8676     assert(s >= rx->subbeg);
8677     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8678     if (i >= 0) {
8679 #ifdef NO_TAINT_SUPPORT
8680         sv_setpvn(sv, s, i);
8681 #else
8682         const int oldtainted = TAINT_get;
8683         TAINT_NOT;
8684         sv_setpvn(sv, s, i);
8685         TAINT_set(oldtainted);
8686 #endif
8687         if (RXp_MATCH_UTF8(rx))
8688             SvUTF8_on(sv);
8689         else
8690             SvUTF8_off(sv);
8691         if (TAINTING_get) {
8692             if (RXp_MATCH_TAINTED(rx)) {
8693                 if (SvTYPE(sv) >= SVt_PVMG) {
8694                     MAGIC* const mg = SvMAGIC(sv);
8695                     MAGIC* mgt;
8696                     TAINT;
8697                     SvMAGIC_set(sv, mg->mg_moremagic);
8698                     SvTAINT(sv);
8699                     if ((mgt = SvMAGIC(sv))) {
8700                         mg->mg_moremagic = mgt;
8701                         SvMAGIC_set(sv, mg);
8702                     }
8703                 } else {
8704                     TAINT;
8705                     SvTAINT(sv);
8706                 }
8707             } else
8708                 SvTAINTED_off(sv);
8709         }
8710     } else {
8711       ret_undef:
8712         sv_set_undef(sv);
8713         return;
8714     }
8715 }
8716
8717 void
8718 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8719                                                          SV const * const value)
8720 {
8721     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8722
8723     PERL_UNUSED_ARG(rx);
8724     PERL_UNUSED_ARG(paren);
8725     PERL_UNUSED_ARG(value);
8726
8727     if (!PL_localizing)
8728         Perl_croak_no_modify();
8729 }
8730
8731 I32
8732 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8733                               const I32 paren)
8734 {
8735     struct regexp *const rx = ReANY(r);
8736     I32 i;
8737     I32 s1, t1;
8738
8739     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8740
8741     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8742         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8743         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8744     )
8745     {
8746         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8747         if (!keepcopy) {
8748             /* on something like
8749              *    $r = qr/.../;
8750              *    /$qr/p;
8751              * the KEEPCOPY is set on the PMOP rather than the regex */
8752             if (PL_curpm && r == PM_GETRE(PL_curpm))
8753                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8754         }
8755         if (!keepcopy)
8756             goto warn_undef;
8757     }
8758
8759     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8760     switch (paren) {
8761       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8762       case RX_BUFF_IDX_PREMATCH:       /* $` */
8763         if (rx->offs[0].start != -1) {
8764                         i = rx->offs[0].start;
8765                         if (i > 0) {
8766                                 s1 = 0;
8767                                 t1 = i;
8768                                 goto getlen;
8769                         }
8770             }
8771         return 0;
8772
8773       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8774       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8775             if (rx->offs[0].end != -1) {
8776                         i = rx->sublen - rx->offs[0].end;
8777                         if (i > 0) {
8778                                 s1 = rx->offs[0].end;
8779                                 t1 = rx->sublen;
8780                                 goto getlen;
8781                         }
8782             }
8783         return 0;
8784
8785       default: /* $& / ${^MATCH}, $1, $2, ... */
8786             if (paren <= (I32)rx->nparens &&
8787             (s1 = rx->offs[paren].start) != -1 &&
8788             (t1 = rx->offs[paren].end) != -1)
8789             {
8790             i = t1 - s1;
8791             goto getlen;
8792         } else {
8793           warn_undef:
8794             if (ckWARN(WARN_UNINITIALIZED))
8795                 report_uninit((const SV *)sv);
8796             return 0;
8797         }
8798     }
8799   getlen:
8800     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8801         const char * const s = rx->subbeg - rx->suboffset + s1;
8802         const U8 *ep;
8803         STRLEN el;
8804
8805         i = t1 - s1;
8806         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8807                         i = el;
8808     }
8809     return i;
8810 }
8811
8812 SV*
8813 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8814 {
8815     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8816         PERL_UNUSED_ARG(rx);
8817         if (0)
8818             return NULL;
8819         else
8820             return newSVpvs("Regexp");
8821 }
8822
8823 /* Scans the name of a named buffer from the pattern.
8824  * If flags is REG_RSN_RETURN_NULL returns null.
8825  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8826  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8827  * to the parsed name as looked up in the RExC_paren_names hash.
8828  * If there is an error throws a vFAIL().. type exception.
8829  */
8830
8831 #define REG_RSN_RETURN_NULL    0
8832 #define REG_RSN_RETURN_NAME    1
8833 #define REG_RSN_RETURN_DATA    2
8834
8835 STATIC SV*
8836 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8837 {
8838     char *name_start = RExC_parse;
8839     SV* sv_name;
8840
8841     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8842
8843     assert (RExC_parse <= RExC_end);
8844     if (RExC_parse == RExC_end) NOOP;
8845     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8846          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8847           * using do...while */
8848         if (UTF)
8849             do {
8850                 RExC_parse += UTF8SKIP(RExC_parse);
8851             } while (   RExC_parse < RExC_end
8852                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8853         else
8854             do {
8855                 RExC_parse++;
8856             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8857     } else {
8858         RExC_parse++; /* so the <- from the vFAIL is after the offending
8859                          character */
8860         vFAIL("Group name must start with a non-digit word character");
8861     }
8862     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8863                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8864     if ( flags == REG_RSN_RETURN_NAME)
8865         return sv_name;
8866     else if (flags==REG_RSN_RETURN_DATA) {
8867         HE *he_str = NULL;
8868         SV *sv_dat = NULL;
8869         if ( ! sv_name )      /* should not happen*/
8870             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8871         if (RExC_paren_names)
8872             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8873         if ( he_str )
8874             sv_dat = HeVAL(he_str);
8875         if ( ! sv_dat ) {   /* Didn't find group */
8876
8877             /* It might be a forward reference; we can't fail until we
8878                 * know, by completing the parse to get all the groups, and
8879                 * then reparsing */
8880             if (ALL_PARENS_COUNTED)  {
8881                 vFAIL("Reference to nonexistent named group");
8882             }
8883             else {
8884                 REQUIRE_PARENS_PASS;
8885             }
8886         }
8887         return sv_dat;
8888     }
8889
8890     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8891                      (unsigned long) flags);
8892 }
8893
8894 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8895     if (RExC_lastparse!=RExC_parse) {                           \
8896         Perl_re_printf( aTHX_  "%s",                            \
8897             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8898                 RExC_end - RExC_parse, 16,                      \
8899                 "", "",                                         \
8900                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8901                 PERL_PV_PRETTY_ELLIPSES   |                     \
8902                 PERL_PV_PRETTY_LTGT       |                     \
8903                 PERL_PV_ESCAPE_RE         |                     \
8904                 PERL_PV_PRETTY_EXACTSIZE                        \
8905             )                                                   \
8906         );                                                      \
8907     } else                                                      \
8908         Perl_re_printf( aTHX_ "%16s","");                       \
8909                                                                 \
8910     if (RExC_lastnum!=RExC_emit)                                \
8911        Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
8912     else                                                        \
8913        Perl_re_printf( aTHX_ "|%4s","");                        \
8914     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
8915         (int)((depth*2)), "",                                   \
8916         (funcname)                                              \
8917     );                                                          \
8918     RExC_lastnum=RExC_emit;                                     \
8919     RExC_lastparse=RExC_parse;                                  \
8920 })
8921
8922
8923
8924 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8925     DEBUG_PARSE_MSG((funcname));                            \
8926     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8927 })
8928 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8929     DEBUG_PARSE_MSG((funcname));                            \
8930     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8931 })
8932
8933 /* This section of code defines the inversion list object and its methods.  The
8934  * interfaces are highly subject to change, so as much as possible is static to
8935  * this file.  An inversion list is here implemented as a malloc'd C UV array
8936  * as an SVt_INVLIST scalar.
8937  *
8938  * An inversion list for Unicode is an array of code points, sorted by ordinal
8939  * number.  Each element gives the code point that begins a range that extends
8940  * up-to but not including the code point given by the next element.  The final
8941  * element gives the first code point of a range that extends to the platform's
8942  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8943  * ...) give ranges whose code points are all in the inversion list.  We say
8944  * that those ranges are in the set.  The odd-numbered elements give ranges
8945  * whose code points are not in the inversion list, and hence not in the set.
8946  * Thus, element [0] is the first code point in the list.  Element [1]
8947  * is the first code point beyond that not in the list; and element [2] is the
8948  * first code point beyond that that is in the list.  In other words, the first
8949  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8950  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8951  * all code points in that range are not in the inversion list.  The third
8952  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8953  * list, and so forth.  Thus every element whose index is divisible by two
8954  * gives the beginning of a range that is in the list, and every element whose
8955  * index is not divisible by two gives the beginning of a range not in the
8956  * list.  If the final element's index is divisible by two, the inversion list
8957  * extends to the platform's infinity; otherwise the highest code point in the
8958  * inversion list is the contents of that element minus 1.
8959  *
8960  * A range that contains just a single code point N will look like
8961  *  invlist[i]   == N
8962  *  invlist[i+1] == N+1
8963  *
8964  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8965  * impossible to represent, so element [i+1] is omitted.  The single element
8966  * inversion list
8967  *  invlist[0] == UV_MAX
8968  * contains just UV_MAX, but is interpreted as matching to infinity.
8969  *
8970  * Taking the complement (inverting) an inversion list is quite simple, if the
8971  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8972  * This implementation reserves an element at the beginning of each inversion
8973  * list to always contain 0; there is an additional flag in the header which
8974  * indicates if the list begins at the 0, or is offset to begin at the next
8975  * element.  This means that the inversion list can be inverted without any
8976  * copying; just flip the flag.
8977  *
8978  * More about inversion lists can be found in "Unicode Demystified"
8979  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8980  *
8981  * The inversion list data structure is currently implemented as an SV pointing
8982  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8983  * array of UV whose memory management is automatically handled by the existing
8984  * facilities for SV's.
8985  *
8986  * Some of the methods should always be private to the implementation, and some
8987  * should eventually be made public */
8988
8989 /* The header definitions are in F<invlist_inline.h> */
8990
8991 #ifndef PERL_IN_XSUB_RE
8992
8993 PERL_STATIC_INLINE UV*
8994 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8995 {
8996     /* Returns a pointer to the first element in the inversion list's array.
8997      * This is called upon initialization of an inversion list.  Where the
8998      * array begins depends on whether the list has the code point U+0000 in it
8999      * or not.  The other parameter tells it whether the code that follows this
9000      * call is about to put a 0 in the inversion list or not.  The first
9001      * element is either the element reserved for 0, if TRUE, or the element
9002      * after it, if FALSE */
9003
9004     bool* offset = get_invlist_offset_addr(invlist);
9005     UV* zero_addr = (UV *) SvPVX(invlist);
9006
9007     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9008
9009     /* Must be empty */
9010     assert(! _invlist_len(invlist));
9011
9012     *zero_addr = 0;
9013
9014     /* 1^1 = 0; 1^0 = 1 */
9015     *offset = 1 ^ will_have_0;
9016     return zero_addr + *offset;
9017 }
9018
9019 PERL_STATIC_INLINE void
9020 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
9021 {
9022     /* Sets the current number of elements stored in the inversion list.
9023      * Updates SvCUR correspondingly */
9024     PERL_UNUSED_CONTEXT;
9025     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
9026
9027     assert(is_invlist(invlist));
9028
9029     SvCUR_set(invlist,
9030               (len == 0)
9031                ? 0
9032                : TO_INTERNAL_SIZE(len + offset));
9033     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
9034 }
9035
9036 STATIC void
9037 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9038 {
9039     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9040      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9041      * is similar to what SvSetMagicSV() would do, if it were implemented on
9042      * inversion lists, though this routine avoids a copy */
9043
9044     const UV src_len          = _invlist_len(src);
9045     const bool src_offset     = *get_invlist_offset_addr(src);
9046     const STRLEN src_byte_len = SvLEN(src);
9047     char * array              = SvPVX(src);
9048
9049     const int oldtainted = TAINT_get;
9050
9051     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9052
9053     assert(is_invlist(src));
9054     assert(is_invlist(dest));
9055     assert(! invlist_is_iterating(src));
9056     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9057
9058     /* Make sure it ends in the right place with a NUL, as our inversion list
9059      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9060      * asserts it */
9061     array[src_byte_len - 1] = '\0';
9062
9063     TAINT_NOT;      /* Otherwise it breaks */
9064     sv_usepvn_flags(dest,
9065                     (char *) array,
9066                     src_byte_len - 1,
9067
9068                     /* This flag is documented to cause a copy to be avoided */
9069                     SV_HAS_TRAILING_NUL);
9070     TAINT_set(oldtainted);
9071     SvPV_set(src, 0);
9072     SvLEN_set(src, 0);
9073     SvCUR_set(src, 0);
9074
9075     /* Finish up copying over the other fields in an inversion list */
9076     *get_invlist_offset_addr(dest) = src_offset;
9077     invlist_set_len(dest, src_len, src_offset);
9078     *get_invlist_previous_index_addr(dest) = 0;
9079     invlist_iterfinish(dest);
9080 }
9081
9082 PERL_STATIC_INLINE IV*
9083 S_get_invlist_previous_index_addr(SV* invlist)
9084 {
9085     /* Return the address of the IV that is reserved to hold the cached index
9086      * */
9087     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9088
9089     assert(is_invlist(invlist));
9090
9091     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9092 }
9093
9094 PERL_STATIC_INLINE IV
9095 S_invlist_previous_index(SV* const invlist)
9096 {
9097     /* Returns cached index of previous search */
9098
9099     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9100
9101     return *get_invlist_previous_index_addr(invlist);
9102 }
9103
9104 PERL_STATIC_INLINE void
9105 S_invlist_set_previous_index(SV* const invlist, const IV index)
9106 {
9107     /* Caches <index> for later retrieval */
9108
9109     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9110
9111     assert(index == 0 || index < (int) _invlist_len(invlist));
9112
9113     *get_invlist_previous_index_addr(invlist) = index;
9114 }
9115
9116 PERL_STATIC_INLINE void
9117 S_invlist_trim(SV* invlist)
9118 {
9119     /* Free the not currently-being-used space in an inversion list */
9120
9121     /* But don't free up the space needed for the 0 UV that is always at the
9122      * beginning of the list, nor the trailing NUL */
9123     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9124
9125     PERL_ARGS_ASSERT_INVLIST_TRIM;
9126
9127     assert(is_invlist(invlist));
9128
9129     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9130 }
9131
9132 PERL_STATIC_INLINE void
9133 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9134 {
9135     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9136
9137     assert(is_invlist(invlist));
9138
9139     invlist_set_len(invlist, 0, 0);
9140     invlist_trim(invlist);
9141 }
9142
9143 #endif /* ifndef PERL_IN_XSUB_RE */
9144
9145 PERL_STATIC_INLINE bool
9146 S_invlist_is_iterating(SV* const invlist)
9147 {
9148     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9149
9150     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9151 }
9152
9153 #ifndef PERL_IN_XSUB_RE
9154
9155 PERL_STATIC_INLINE UV
9156 S_invlist_max(SV* const invlist)
9157 {
9158     /* Returns the maximum number of elements storable in the inversion list's
9159      * array, without having to realloc() */
9160
9161     PERL_ARGS_ASSERT_INVLIST_MAX;
9162
9163     assert(is_invlist(invlist));
9164
9165     /* Assumes worst case, in which the 0 element is not counted in the
9166      * inversion list, so subtracts 1 for that */
9167     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9168            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9169            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9170 }
9171
9172 STATIC void
9173 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9174 {
9175     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9176
9177     /* First 1 is in case the zero element isn't in the list; second 1 is for
9178      * trailing NUL */
9179     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9180     invlist_set_len(invlist, 0, 0);
9181
9182     /* Force iterinit() to be used to get iteration to work */
9183     invlist_iterfinish(invlist);
9184
9185     *get_invlist_previous_index_addr(invlist) = 0;
9186 }
9187
9188 SV*
9189 Perl__new_invlist(pTHX_ IV initial_size)
9190 {
9191
9192     /* Return a pointer to a newly constructed inversion list, with enough
9193      * space to store 'initial_size' elements.  If that number is negative, a
9194      * system default is used instead */
9195
9196     SV* new_list;
9197
9198     if (initial_size < 0) {
9199         initial_size = 10;
9200     }
9201
9202     new_list = newSV_type(SVt_INVLIST);
9203     initialize_invlist_guts(new_list, initial_size);
9204
9205     return new_list;
9206 }
9207
9208 SV*
9209 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9210 {
9211     /* Return a pointer to a newly constructed inversion list, initialized to
9212      * point to <list>, which has to be in the exact correct inversion list
9213      * form, including internal fields.  Thus this is a dangerous routine that
9214      * should not be used in the wrong hands.  The passed in 'list' contains
9215      * several header fields at the beginning that are not part of the
9216      * inversion list body proper */
9217
9218     const STRLEN length = (STRLEN) list[0];
9219     const UV version_id =          list[1];
9220     const bool offset   =    cBOOL(list[2]);
9221 #define HEADER_LENGTH 3
9222     /* If any of the above changes in any way, you must change HEADER_LENGTH
9223      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9224      *      perl -E 'say int(rand 2**31-1)'
9225      */
9226 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9227                                         data structure type, so that one being
9228                                         passed in can be validated to be an
9229                                         inversion list of the correct vintage.
9230                                        */
9231
9232     SV* invlist = newSV_type(SVt_INVLIST);
9233
9234     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9235
9236     if (version_id != INVLIST_VERSION_ID) {
9237         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9238     }
9239
9240     /* The generated array passed in includes header elements that aren't part
9241      * of the list proper, so start it just after them */
9242     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9243
9244     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9245                                shouldn't touch it */
9246
9247     *(get_invlist_offset_addr(invlist)) = offset;
9248
9249     /* The 'length' passed to us is the physical number of elements in the
9250      * inversion list.  But if there is an offset the logical number is one
9251      * less than that */
9252     invlist_set_len(invlist, length  - offset, offset);
9253
9254     invlist_set_previous_index(invlist, 0);
9255
9256     /* Initialize the iteration pointer. */
9257     invlist_iterfinish(invlist);
9258
9259     SvREADONLY_on(invlist);
9260
9261     return invlist;
9262 }
9263
9264 STATIC void
9265 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9266 {
9267     /* Grow the maximum size of an inversion list */
9268
9269     PERL_ARGS_ASSERT_INVLIST_EXTEND;
9270
9271     assert(is_invlist(invlist));
9272
9273     /* Add one to account for the zero element at the beginning which may not
9274      * be counted by the calling parameters */
9275     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9276 }
9277
9278 STATIC void
9279 S__append_range_to_invlist(pTHX_ SV* const invlist,
9280                                  const UV start, const UV end)
9281 {
9282    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9283     * the end of the inversion list.  The range must be above any existing
9284     * ones. */
9285
9286     UV* array;
9287     UV max = invlist_max(invlist);
9288     UV len = _invlist_len(invlist);
9289     bool offset;
9290
9291     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9292
9293     if (len == 0) { /* Empty lists must be initialized */
9294         offset = start != 0;
9295         array = _invlist_array_init(invlist, ! offset);
9296     }
9297     else {
9298         /* Here, the existing list is non-empty. The current max entry in the
9299          * list is generally the first value not in the set, except when the
9300          * set extends to the end of permissible values, in which case it is
9301          * the first entry in that final set, and so this call is an attempt to
9302          * append out-of-order */
9303
9304         UV final_element = len - 1;
9305         array = invlist_array(invlist);
9306         if (   array[final_element] > start
9307             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9308         {
9309             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9310                      array[final_element], start,
9311                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9312         }
9313
9314         /* Here, it is a legal append.  If the new range begins 1 above the end
9315          * of the range below it, it is extending the range below it, so the
9316          * new first value not in the set is one greater than the newly
9317          * extended range.  */
9318         offset = *get_invlist_offset_addr(invlist);
9319         if (array[final_element] == start) {
9320             if (end != UV_MAX) {
9321                 array[final_element] = end + 1;
9322             }
9323             else {
9324                 /* But if the end is the maximum representable on the machine,
9325                  * assume that infinity was actually what was meant.  Just let
9326                  * the range that this would extend to have no end */
9327                 invlist_set_len(invlist, len - 1, offset);
9328             }
9329             return;
9330         }
9331     }
9332
9333     /* Here the new range doesn't extend any existing set.  Add it */
9334
9335     len += 2;   /* Includes an element each for the start and end of range */
9336
9337     /* If wll overflow the existing space, extend, which may cause the array to
9338      * be moved */
9339     if (max < len) {
9340         invlist_extend(invlist, len);
9341
9342         /* Have to set len here to avoid assert failure in invlist_array() */
9343         invlist_set_len(invlist, len, offset);
9344
9345         array = invlist_array(invlist);
9346     }
9347     else {
9348         invlist_set_len(invlist, len, offset);
9349     }
9350
9351     /* The next item on the list starts the range, the one after that is
9352      * one past the new range.  */
9353     array[len - 2] = start;
9354     if (end != UV_MAX) {
9355         array[len - 1] = end + 1;
9356     }
9357     else {
9358         /* But if the end is the maximum representable on the machine, just let
9359          * the range have no end */
9360         invlist_set_len(invlist, len - 1, offset);
9361     }
9362 }
9363
9364 SSize_t
9365 Perl__invlist_search(SV* const invlist, const UV cp)
9366 {
9367     /* Searches the inversion list for the entry that contains the input code
9368      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9369      * return value is the index into the list's array of the range that
9370      * contains <cp>, that is, 'i' such that
9371      *  array[i] <= cp < array[i+1]
9372      */
9373
9374     IV low = 0;
9375     IV mid;
9376     IV high = _invlist_len(invlist);
9377     const IV highest_element = high - 1;
9378     const UV* array;
9379
9380     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9381
9382     /* If list is empty, return failure. */
9383     if (high == 0) {
9384         return -1;
9385     }
9386
9387     /* (We can't get the array unless we know the list is non-empty) */
9388     array = invlist_array(invlist);
9389
9390     mid = invlist_previous_index(invlist);
9391     assert(mid >=0);
9392     if (mid > highest_element) {
9393         mid = highest_element;
9394     }
9395
9396     /* <mid> contains the cache of the result of the previous call to this
9397      * function (0 the first time).  See if this call is for the same result,
9398      * or if it is for mid-1.  This is under the theory that calls to this
9399      * function will often be for related code points that are near each other.
9400      * And benchmarks show that caching gives better results.  We also test
9401      * here if the code point is within the bounds of the list.  These tests
9402      * replace others that would have had to be made anyway to make sure that
9403      * the array bounds were not exceeded, and these give us extra information
9404      * at the same time */
9405     if (cp >= array[mid]) {
9406         if (cp >= array[highest_element]) {
9407             return highest_element;
9408         }
9409
9410         /* Here, array[mid] <= cp < array[highest_element].  This means that
9411          * the final element is not the answer, so can exclude it; it also
9412          * means that <mid> is not the final element, so can refer to 'mid + 1'
9413          * safely */
9414         if (cp < array[mid + 1]) {
9415             return mid;
9416         }
9417         high--;
9418         low = mid + 1;
9419     }
9420     else { /* cp < aray[mid] */
9421         if (cp < array[0]) { /* Fail if outside the array */
9422             return -1;
9423         }
9424         high = mid;
9425         if (cp >= array[mid - 1]) {
9426             goto found_entry;
9427         }
9428     }
9429
9430     /* Binary search.  What we are looking for is <i> such that
9431      *  array[i] <= cp < array[i+1]
9432      * The loop below converges on the i+1.  Note that there may not be an
9433      * (i+1)th element in the array, and things work nonetheless */
9434     while (low < high) {
9435         mid = (low + high) / 2;
9436         assert(mid <= highest_element);
9437         if (array[mid] <= cp) { /* cp >= array[mid] */
9438             low = mid + 1;
9439
9440             /* We could do this extra test to exit the loop early.
9441             if (cp < array[low]) {
9442                 return mid;
9443             }
9444             */
9445         }
9446         else { /* cp < array[mid] */
9447             high = mid;
9448         }
9449     }
9450
9451   found_entry:
9452     high--;
9453     invlist_set_previous_index(invlist, high);
9454     return high;
9455 }
9456
9457 void
9458 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9459                                          const bool complement_b, SV** output)
9460 {
9461     /* Take the union of two inversion lists and point '*output' to it.  On
9462      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9463      * even 'a' or 'b').  If to an inversion list, the contents of the original
9464      * list will be replaced by the union.  The first list, 'a', may be
9465      * NULL, in which case a copy of the second list is placed in '*output'.
9466      * If 'complement_b' is TRUE, the union is taken of the complement
9467      * (inversion) of 'b' instead of b itself.
9468      *
9469      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9470      * Richard Gillam, published by Addison-Wesley, and explained at some
9471      * length there.  The preface says to incorporate its examples into your
9472      * code at your own risk.
9473      *
9474      * The algorithm is like a merge sort. */
9475
9476     const UV* array_a;    /* a's array */
9477     const UV* array_b;
9478     UV len_a;       /* length of a's array */
9479     UV len_b;
9480
9481     SV* u;                      /* the resulting union */
9482     UV* array_u;
9483     UV len_u = 0;
9484
9485     UV i_a = 0;             /* current index into a's array */
9486     UV i_b = 0;
9487     UV i_u = 0;
9488
9489     /* running count, as explained in the algorithm source book; items are
9490      * stopped accumulating and are output when the count changes to/from 0.
9491      * The count is incremented when we start a range that's in an input's set,
9492      * and decremented when we start a range that's not in a set.  So this
9493      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9494      * and hence nothing goes into the union; 1, just one of the inputs is in
9495      * its set (and its current range gets added to the union); and 2 when both
9496      * inputs are in their sets.  */
9497     UV count = 0;
9498
9499     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9500     assert(a != b);
9501     assert(*output == NULL || is_invlist(*output));
9502
9503     len_b = _invlist_len(b);
9504     if (len_b == 0) {
9505
9506         /* Here, 'b' is empty, hence it's complement is all possible code
9507          * points.  So if the union includes the complement of 'b', it includes
9508          * everything, and we need not even look at 'a'.  It's easiest to
9509          * create a new inversion list that matches everything.  */
9510         if (complement_b) {
9511             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9512
9513             if (*output == NULL) { /* If the output didn't exist, just point it
9514                                       at the new list */
9515                 *output = everything;
9516             }
9517             else { /* Otherwise, replace its contents with the new list */
9518                 invlist_replace_list_destroys_src(*output, everything);
9519                 SvREFCNT_dec_NN(everything);
9520             }
9521
9522             return;
9523         }
9524
9525         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9526          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9527          * output will be empty */
9528
9529         if (a == NULL || _invlist_len(a) == 0) {
9530             if (*output == NULL) {
9531                 *output = _new_invlist(0);
9532             }
9533             else {
9534                 invlist_clear(*output);
9535             }
9536             return;
9537         }
9538
9539         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9540          * union.  We can just return a copy of 'a' if '*output' doesn't point
9541          * to an existing list */
9542         if (*output == NULL) {
9543             *output = invlist_clone(a, NULL);
9544             return;
9545         }
9546
9547         /* If the output is to overwrite 'a', we have a no-op, as it's
9548          * already in 'a' */
9549         if (*output == a) {
9550             return;
9551         }
9552
9553         /* Here, '*output' is to be overwritten by 'a' */
9554         u = invlist_clone(a, NULL);
9555         invlist_replace_list_destroys_src(*output, u);
9556         SvREFCNT_dec_NN(u);
9557
9558         return;
9559     }
9560
9561     /* Here 'b' is not empty.  See about 'a' */
9562
9563     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9564
9565         /* Here, 'a' is empty (and b is not).  That means the union will come
9566          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9567          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9568          * the clone */
9569
9570         SV ** dest = (*output == NULL) ? output : &u;
9571         *dest = invlist_clone(b, NULL);
9572         if (complement_b) {
9573             _invlist_invert(*dest);
9574         }
9575
9576         if (dest == &u) {
9577             invlist_replace_list_destroys_src(*output, u);
9578             SvREFCNT_dec_NN(u);
9579         }
9580
9581         return;
9582     }
9583
9584     /* Here both lists exist and are non-empty */
9585     array_a = invlist_array(a);
9586     array_b = invlist_array(b);
9587
9588     /* If are to take the union of 'a' with the complement of b, set it
9589      * up so are looking at b's complement. */
9590     if (complement_b) {
9591
9592         /* To complement, we invert: if the first element is 0, remove it.  To
9593          * do this, we just pretend the array starts one later */
9594         if (array_b[0] == 0) {
9595             array_b++;
9596             len_b--;
9597         }
9598         else {
9599
9600             /* But if the first element is not zero, we pretend the list starts
9601              * at the 0 that is always stored immediately before the array. */
9602             array_b--;
9603             len_b++;
9604         }
9605     }
9606
9607     /* Size the union for the worst case: that the sets are completely
9608      * disjoint */
9609     u = _new_invlist(len_a + len_b);
9610
9611     /* Will contain U+0000 if either component does */
9612     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9613                                       || (len_b > 0 && array_b[0] == 0));
9614
9615     /* Go through each input list item by item, stopping when have exhausted
9616      * one of them */
9617     while (i_a < len_a && i_b < len_b) {
9618         UV cp;      /* The element to potentially add to the union's array */
9619         bool cp_in_set;   /* is it in the the input list's set or not */
9620
9621         /* We need to take one or the other of the two inputs for the union.
9622          * Since we are merging two sorted lists, we take the smaller of the
9623          * next items.  In case of a tie, we take first the one that is in its
9624          * set.  If we first took the one not in its set, it would decrement
9625          * the count, possibly to 0 which would cause it to be output as ending
9626          * the range, and the next time through we would take the same number,
9627          * and output it again as beginning the next range.  By doing it the
9628          * opposite way, there is no possibility that the count will be
9629          * momentarily decremented to 0, and thus the two adjoining ranges will
9630          * be seamlessly merged.  (In a tie and both are in the set or both not
9631          * in the set, it doesn't matter which we take first.) */
9632         if (       array_a[i_a] < array_b[i_b]
9633             || (   array_a[i_a] == array_b[i_b]
9634                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9635         {
9636             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9637             cp = array_a[i_a++];
9638         }
9639         else {
9640             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9641             cp = array_b[i_b++];
9642         }
9643
9644         /* Here, have chosen which of the two inputs to look at.  Only output
9645          * if the running count changes to/from 0, which marks the
9646          * beginning/end of a range that's in the set */
9647         if (cp_in_set) {
9648             if (count == 0) {
9649                 array_u[i_u++] = cp;
9650             }
9651             count++;
9652         }
9653         else {
9654             count--;
9655             if (count == 0) {
9656                 array_u[i_u++] = cp;
9657             }
9658         }
9659     }
9660
9661
9662     /* The loop above increments the index into exactly one of the input lists
9663      * each iteration, and ends when either index gets to its list end.  That
9664      * means the other index is lower than its end, and so something is
9665      * remaining in that one.  We decrement 'count', as explained below, if
9666      * that list is in its set.  (i_a and i_b each currently index the element
9667      * beyond the one we care about.) */
9668     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9669         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9670     {
9671         count--;
9672     }
9673
9674     /* Above we decremented 'count' if the list that had unexamined elements in
9675      * it was in its set.  This has made it so that 'count' being non-zero
9676      * means there isn't anything left to output; and 'count' equal to 0 means
9677      * that what is left to output is precisely that which is left in the
9678      * non-exhausted input list.
9679      *
9680      * To see why, note first that the exhausted input obviously has nothing
9681      * left to add to the union.  If it was in its set at its end, that means
9682      * the set extends from here to the platform's infinity, and hence so does
9683      * the union and the non-exhausted set is irrelevant.  The exhausted set
9684      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9685      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9686      * 'count' remains at 1.  This is consistent with the decremented 'count'
9687      * != 0 meaning there's nothing left to add to the union.
9688      *
9689      * But if the exhausted input wasn't in its set, it contributed 0 to
9690      * 'count', and the rest of the union will be whatever the other input is.
9691      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9692      * otherwise it gets decremented to 0.  This is consistent with 'count'
9693      * == 0 meaning the remainder of the union is whatever is left in the
9694      * non-exhausted list. */
9695     if (count != 0) {
9696         len_u = i_u;
9697     }
9698     else {
9699         IV copy_count = len_a - i_a;
9700         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9701             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9702         }
9703         else { /* The non-exhausted input is b */
9704             copy_count = len_b - i_b;
9705             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9706         }
9707         len_u = i_u + copy_count;
9708     }
9709
9710     /* Set the result to the final length, which can change the pointer to
9711      * array_u, so re-find it.  (Note that it is unlikely that this will
9712      * change, as we are shrinking the space, not enlarging it) */
9713     if (len_u != _invlist_len(u)) {
9714         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9715         invlist_trim(u);
9716         array_u = invlist_array(u);
9717     }
9718
9719     if (*output == NULL) {  /* Simply return the new inversion list */
9720         *output = u;
9721     }
9722     else {
9723         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9724          * could instead free '*output', and then set it to 'u', but experience
9725          * has shown [perl #127392] that if the input is a mortal, we can get a
9726          * huge build-up of these during regex compilation before they get
9727          * freed. */
9728         invlist_replace_list_destroys_src(*output, u);
9729         SvREFCNT_dec_NN(u);
9730     }
9731
9732     return;
9733 }
9734
9735 void
9736 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9737                                                const bool complement_b, SV** i)
9738 {
9739     /* Take the intersection of two inversion lists and point '*i' to it.  On
9740      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9741      * even 'a' or 'b').  If to an inversion list, the contents of the original
9742      * list will be replaced by the intersection.  The first list, 'a', may be
9743      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9744      * TRUE, the result will be the intersection of 'a' and the complement (or
9745      * inversion) of 'b' instead of 'b' directly.
9746      *
9747      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9748      * Richard Gillam, published by Addison-Wesley, and explained at some
9749      * length there.  The preface says to incorporate its examples into your
9750      * code at your own risk.  In fact, it had bugs
9751      *
9752      * The algorithm is like a merge sort, and is essentially the same as the
9753      * union above
9754      */
9755
9756     const UV* array_a;          /* a's array */
9757     const UV* array_b;
9758     UV len_a;   /* length of a's array */
9759     UV len_b;
9760
9761     SV* r;                   /* the resulting intersection */
9762     UV* array_r;
9763     UV len_r = 0;
9764
9765     UV i_a = 0;             /* current index into a's array */
9766     UV i_b = 0;
9767     UV i_r = 0;
9768
9769     /* running count of how many of the two inputs are postitioned at ranges
9770      * that are in their sets.  As explained in the algorithm source book,
9771      * items are stopped accumulating and are output when the count changes
9772      * to/from 2.  The count is incremented when we start a range that's in an
9773      * input's set, and decremented when we start a range that's not in a set.
9774      * Only when it is 2 are we in the intersection. */
9775     UV count = 0;
9776
9777     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9778     assert(a != b);
9779     assert(*i == NULL || is_invlist(*i));
9780
9781     /* Special case if either one is empty */
9782     len_a = (a == NULL) ? 0 : _invlist_len(a);
9783     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9784         if (len_a != 0 && complement_b) {
9785
9786             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9787              * must be empty.  Here, also we are using 'b's complement, which
9788              * hence must be every possible code point.  Thus the intersection
9789              * is simply 'a'. */
9790
9791             if (*i == a) {  /* No-op */
9792                 return;
9793             }
9794
9795             if (*i == NULL) {
9796                 *i = invlist_clone(a, NULL);
9797                 return;
9798             }
9799
9800             r = invlist_clone(a, NULL);
9801             invlist_replace_list_destroys_src(*i, r);
9802             SvREFCNT_dec_NN(r);
9803             return;
9804         }
9805
9806         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9807          * intersection must be empty */
9808         if (*i == NULL) {
9809             *i = _new_invlist(0);
9810             return;
9811         }
9812
9813         invlist_clear(*i);
9814         return;
9815     }
9816
9817     /* Here both lists exist and are non-empty */
9818     array_a = invlist_array(a);
9819     array_b = invlist_array(b);
9820
9821     /* If are to take the intersection of 'a' with the complement of b, set it
9822      * up so are looking at b's complement. */
9823     if (complement_b) {
9824
9825         /* To complement, we invert: if the first element is 0, remove it.  To
9826          * do this, we just pretend the array starts one later */
9827         if (array_b[0] == 0) {
9828             array_b++;
9829             len_b--;
9830         }
9831         else {
9832
9833             /* But if the first element is not zero, we pretend the list starts
9834              * at the 0 that is always stored immediately before the array. */
9835             array_b--;
9836             len_b++;
9837         }
9838     }
9839
9840     /* Size the intersection for the worst case: that the intersection ends up
9841      * fragmenting everything to be completely disjoint */
9842     r= _new_invlist(len_a + len_b);
9843
9844     /* Will contain U+0000 iff both components do */
9845     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9846                                      && len_b > 0 && array_b[0] == 0);
9847
9848     /* Go through each list item by item, stopping when have exhausted one of
9849      * them */
9850     while (i_a < len_a && i_b < len_b) {
9851         UV cp;      /* The element to potentially add to the intersection's
9852                        array */
9853         bool cp_in_set; /* Is it in the input list's set or not */
9854
9855         /* We need to take one or the other of the two inputs for the
9856          * intersection.  Since we are merging two sorted lists, we take the
9857          * smaller of the next items.  In case of a tie, we take first the one
9858          * that is not in its set (a difference from the union algorithm).  If
9859          * we first took the one in its set, it would increment the count,
9860          * possibly to 2 which would cause it to be output as starting a range
9861          * in the intersection, and the next time through we would take that
9862          * same number, and output it again as ending the set.  By doing the
9863          * opposite of this, there is no possibility that the count will be
9864          * momentarily incremented to 2.  (In a tie and both are in the set or
9865          * both not in the set, it doesn't matter which we take first.) */
9866         if (       array_a[i_a] < array_b[i_b]
9867             || (   array_a[i_a] == array_b[i_b]
9868                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9869         {
9870             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9871             cp = array_a[i_a++];
9872         }
9873         else {
9874             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9875             cp= array_b[i_b++];
9876         }
9877
9878         /* Here, have chosen which of the two inputs to look at.  Only output
9879          * if the running count changes to/from 2, which marks the
9880          * beginning/end of a range that's in the intersection */
9881         if (cp_in_set) {
9882             count++;
9883             if (count == 2) {
9884                 array_r[i_r++] = cp;
9885             }
9886         }
9887         else {
9888             if (count == 2) {
9889                 array_r[i_r++] = cp;
9890             }
9891             count--;
9892         }
9893
9894     }
9895
9896     /* The loop above increments the index into exactly one of the input lists
9897      * each iteration, and ends when either index gets to its list end.  That
9898      * means the other index is lower than its end, and so something is
9899      * remaining in that one.  We increment 'count', as explained below, if the
9900      * exhausted list was in its set.  (i_a and i_b each currently index the
9901      * element beyond the one we care about.) */
9902     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9903         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9904     {
9905         count++;
9906     }
9907
9908     /* Above we incremented 'count' if the exhausted list was in its set.  This
9909      * has made it so that 'count' being below 2 means there is nothing left to
9910      * output; otheriwse what's left to add to the intersection is precisely
9911      * that which is left in the non-exhausted input list.
9912      *
9913      * To see why, note first that the exhausted input obviously has nothing
9914      * left to affect the intersection.  If it was in its set at its end, that
9915      * means the set extends from here to the platform's infinity, and hence
9916      * anything in the non-exhausted's list will be in the intersection, and
9917      * anything not in it won't be.  Hence, the rest of the intersection is
9918      * precisely what's in the non-exhausted list  The exhausted set also
9919      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9920      * it means 'count' is now at least 2.  This is consistent with the
9921      * incremented 'count' being >= 2 means to add the non-exhausted list to
9922      * the intersection.
9923      *
9924      * But if the exhausted input wasn't in its set, it contributed 0 to
9925      * 'count', and the intersection can't include anything further; the
9926      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9927      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9928      * further to add to the intersection. */
9929     if (count < 2) { /* Nothing left to put in the intersection. */
9930         len_r = i_r;
9931     }
9932     else { /* copy the non-exhausted list, unchanged. */
9933         IV copy_count = len_a - i_a;
9934         if (copy_count > 0) {   /* a is the one with stuff left */
9935             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9936         }
9937         else {  /* b is the one with stuff left */
9938             copy_count = len_b - i_b;
9939             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9940         }
9941         len_r = i_r + copy_count;
9942     }
9943
9944     /* Set the result to the final length, which can change the pointer to
9945      * array_r, so re-find it.  (Note that it is unlikely that this will
9946      * change, as we are shrinking the space, not enlarging it) */
9947     if (len_r != _invlist_len(r)) {
9948         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9949         invlist_trim(r);
9950         array_r = invlist_array(r);
9951     }
9952
9953     if (*i == NULL) { /* Simply return the calculated intersection */
9954         *i = r;
9955     }
9956     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9957               instead free '*i', and then set it to 'r', but experience has
9958               shown [perl #127392] that if the input is a mortal, we can get a
9959               huge build-up of these during regex compilation before they get
9960               freed. */
9961         if (len_r) {
9962             invlist_replace_list_destroys_src(*i, r);
9963         }
9964         else {
9965             invlist_clear(*i);
9966         }
9967         SvREFCNT_dec_NN(r);
9968     }
9969
9970     return;
9971 }
9972
9973 SV*
9974 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9975 {
9976     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9977      * set.  A pointer to the inversion list is returned.  This may actually be
9978      * a new list, in which case the passed in one has been destroyed.  The
9979      * passed-in inversion list can be NULL, in which case a new one is created
9980      * with just the one range in it.  The new list is not necessarily
9981      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9982      * result of this function.  The gain would not be large, and in many
9983      * cases, this is called multiple times on a single inversion list, so
9984      * anything freed may almost immediately be needed again.
9985      *
9986      * This used to mostly call the 'union' routine, but that is much more
9987      * heavyweight than really needed for a single range addition */
9988
9989     UV* array;              /* The array implementing the inversion list */
9990     UV len;                 /* How many elements in 'array' */
9991     SSize_t i_s;            /* index into the invlist array where 'start'
9992                                should go */
9993     SSize_t i_e = 0;        /* And the index where 'end' should go */
9994     UV cur_highest;         /* The highest code point in the inversion list
9995                                upon entry to this function */
9996
9997     /* This range becomes the whole inversion list if none already existed */
9998     if (invlist == NULL) {
9999         invlist = _new_invlist(2);
10000         _append_range_to_invlist(invlist, start, end);
10001         return invlist;
10002     }
10003
10004     /* Likewise, if the inversion list is currently empty */
10005     len = _invlist_len(invlist);
10006     if (len == 0) {
10007         _append_range_to_invlist(invlist, start, end);
10008         return invlist;
10009     }
10010
10011     /* Starting here, we have to know the internals of the list */
10012     array = invlist_array(invlist);
10013
10014     /* If the new range ends higher than the current highest ... */
10015     cur_highest = invlist_highest(invlist);
10016     if (end > cur_highest) {
10017
10018         /* If the whole range is higher, we can just append it */
10019         if (start > cur_highest) {
10020             _append_range_to_invlist(invlist, start, end);
10021             return invlist;
10022         }
10023
10024         /* Otherwise, add the portion that is higher ... */
10025         _append_range_to_invlist(invlist, cur_highest + 1, end);
10026
10027         /* ... and continue on below to handle the rest.  As a result of the
10028          * above append, we know that the index of the end of the range is the
10029          * final even numbered one of the array.  Recall that the final element
10030          * always starts a range that extends to infinity.  If that range is in
10031          * the set (meaning the set goes from here to infinity), it will be an
10032          * even index, but if it isn't in the set, it's odd, and the final
10033          * range in the set is one less, which is even. */
10034         if (end == UV_MAX) {
10035             i_e = len;
10036         }
10037         else {
10038             i_e = len - 2;
10039         }
10040     }
10041
10042     /* We have dealt with appending, now see about prepending.  If the new
10043      * range starts lower than the current lowest ... */
10044     if (start < array[0]) {
10045
10046         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10047          * Let the union code handle it, rather than having to know the
10048          * trickiness in two code places.  */
10049         if (UNLIKELY(start == 0)) {
10050             SV* range_invlist;
10051
10052             range_invlist = _new_invlist(2);
10053             _append_range_to_invlist(range_invlist, start, end);
10054
10055             _invlist_union(invlist, range_invlist, &invlist);
10056
10057             SvREFCNT_dec_NN(range_invlist);
10058
10059             return invlist;
10060         }
10061
10062         /* If the whole new range comes before the first entry, and doesn't
10063          * extend it, we have to insert it as an additional range */
10064         if (end < array[0] - 1) {
10065             i_s = i_e = -1;
10066             goto splice_in_new_range;
10067         }
10068
10069         /* Here the new range adjoins the existing first range, extending it
10070          * downwards. */
10071         array[0] = start;
10072
10073         /* And continue on below to handle the rest.  We know that the index of
10074          * the beginning of the range is the first one of the array */
10075         i_s = 0;
10076     }
10077     else { /* Not prepending any part of the new range to the existing list.
10078             * Find where in the list it should go.  This finds i_s, such that:
10079             *     invlist[i_s] <= start < array[i_s+1]
10080             */
10081         i_s = _invlist_search(invlist, start);
10082     }
10083
10084     /* At this point, any extending before the beginning of the inversion list
10085      * and/or after the end has been done.  This has made it so that, in the
10086      * code below, each endpoint of the new range is either in a range that is
10087      * in the set, or is in a gap between two ranges that are.  This means we
10088      * don't have to worry about exceeding the array bounds.
10089      *
10090      * Find where in the list the new range ends (but we can skip this if we
10091      * have already determined what it is, or if it will be the same as i_s,
10092      * which we already have computed) */
10093     if (i_e == 0) {
10094         i_e = (start == end)
10095               ? i_s
10096               : _invlist_search(invlist, end);
10097     }
10098
10099     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10100      * is a range that goes to infinity there is no element at invlist[i_e+1],
10101      * so only the first relation holds. */
10102
10103     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10104
10105         /* Here, the ranges on either side of the beginning of the new range
10106          * are in the set, and this range starts in the gap between them.
10107          *
10108          * The new range extends the range above it downwards if the new range
10109          * ends at or above that range's start */
10110         const bool extends_the_range_above = (   end == UV_MAX
10111                                               || end + 1 >= array[i_s+1]);
10112
10113         /* The new range extends the range below it upwards if it begins just
10114          * after where that range ends */
10115         if (start == array[i_s]) {
10116
10117             /* If the new range fills the entire gap between the other ranges,
10118              * they will get merged together.  Other ranges may also get
10119              * merged, depending on how many of them the new range spans.  In
10120              * the general case, we do the merge later, just once, after we
10121              * figure out how many to merge.  But in the case where the new
10122              * range exactly spans just this one gap (possibly extending into
10123              * the one above), we do the merge here, and an early exit.  This
10124              * is done here to avoid having to special case later. */
10125             if (i_e - i_s <= 1) {
10126
10127                 /* If i_e - i_s == 1, it means that the new range terminates
10128                  * within the range above, and hence 'extends_the_range_above'
10129                  * must be true.  (If the range above it extends to infinity,
10130                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10131                  * will be 0, so no harm done.) */
10132                 if (extends_the_range_above) {
10133                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10134                     invlist_set_len(invlist,
10135                                     len - 2,
10136                                     *(get_invlist_offset_addr(invlist)));
10137                     return invlist;
10138                 }
10139
10140                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10141                  * to the same range, and below we are about to decrement i_s
10142                  * */
10143                 i_e--;
10144             }
10145
10146             /* Here, the new range is adjacent to the one below.  (It may also
10147              * span beyond the range above, but that will get resolved later.)
10148              * Extend the range below to include this one. */
10149             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10150             i_s--;
10151             start = array[i_s];
10152         }
10153         else if (extends_the_range_above) {
10154
10155             /* Here the new range only extends the range above it, but not the
10156              * one below.  It merges with the one above.  Again, we keep i_e
10157              * and i_s in sync if they point to the same range */
10158             if (i_e == i_s) {
10159                 i_e++;
10160             }
10161             i_s++;
10162             array[i_s] = start;
10163         }
10164     }
10165
10166     /* Here, we've dealt with the new range start extending any adjoining
10167      * existing ranges.
10168      *
10169      * If the new range extends to infinity, it is now the final one,
10170      * regardless of what was there before */
10171     if (UNLIKELY(end == UV_MAX)) {
10172         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10173         return invlist;
10174     }
10175
10176     /* If i_e started as == i_s, it has also been dealt with,
10177      * and been updated to the new i_s, which will fail the following if */
10178     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10179
10180         /* Here, the ranges on either side of the end of the new range are in
10181          * the set, and this range ends in the gap between them.
10182          *
10183          * If this range is adjacent to (hence extends) the range above it, it
10184          * becomes part of that range; likewise if it extends the range below,
10185          * it becomes part of that range */
10186         if (end + 1 == array[i_e+1]) {
10187             i_e++;
10188             array[i_e] = start;
10189         }
10190         else if (start <= array[i_e]) {
10191             array[i_e] = end + 1;
10192             i_e--;
10193         }
10194     }
10195
10196     if (i_s == i_e) {
10197
10198         /* If the range fits entirely in an existing range (as possibly already
10199          * extended above), it doesn't add anything new */
10200         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10201             return invlist;
10202         }
10203
10204         /* Here, no part of the range is in the list.  Must add it.  It will
10205          * occupy 2 more slots */
10206       splice_in_new_range:
10207
10208         invlist_extend(invlist, len + 2);
10209         array = invlist_array(invlist);
10210         /* Move the rest of the array down two slots. Don't include any
10211          * trailing NUL */
10212         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10213
10214         /* Do the actual splice */
10215         array[i_e+1] = start;
10216         array[i_e+2] = end + 1;
10217         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10218         return invlist;
10219     }
10220
10221     /* Here the new range crossed the boundaries of a pre-existing range.  The
10222      * code above has adjusted things so that both ends are in ranges that are
10223      * in the set.  This means everything in between must also be in the set.
10224      * Just squash things together */
10225     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10226     invlist_set_len(invlist,
10227                     len - i_e + i_s,
10228                     *(get_invlist_offset_addr(invlist)));
10229
10230     return invlist;
10231 }
10232
10233 SV*
10234 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10235                                  UV** other_elements_ptr)
10236 {
10237     /* Create and return an inversion list whose contents are to be populated
10238      * by the caller.  The caller gives the number of elements (in 'size') and
10239      * the very first element ('element0').  This function will set
10240      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10241      * are to be placed.
10242      *
10243      * Obviously there is some trust involved that the caller will properly
10244      * fill in the other elements of the array.
10245      *
10246      * (The first element needs to be passed in, as the underlying code does
10247      * things differently depending on whether it is zero or non-zero) */
10248
10249     SV* invlist = _new_invlist(size);
10250     bool offset;
10251
10252     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10253
10254     invlist = add_cp_to_invlist(invlist, element0);
10255     offset = *get_invlist_offset_addr(invlist);
10256
10257     invlist_set_len(invlist, size, offset);
10258     *other_elements_ptr = invlist_array(invlist) + 1;
10259     return invlist;
10260 }
10261
10262 #endif
10263
10264 PERL_STATIC_INLINE SV*
10265 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10266     return _add_range_to_invlist(invlist, cp, cp);
10267 }
10268
10269 #ifndef PERL_IN_XSUB_RE
10270 void
10271 Perl__invlist_invert(pTHX_ SV* const invlist)
10272 {
10273     /* Complement the input inversion list.  This adds a 0 if the list didn't
10274      * have a zero; removes it otherwise.  As described above, the data
10275      * structure is set up so that this is very efficient */
10276
10277     PERL_ARGS_ASSERT__INVLIST_INVERT;
10278
10279     assert(! invlist_is_iterating(invlist));
10280
10281     /* The inverse of matching nothing is matching everything */
10282     if (_invlist_len(invlist) == 0) {
10283         _append_range_to_invlist(invlist, 0, UV_MAX);
10284         return;
10285     }
10286
10287     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10288 }
10289
10290 SV*
10291 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10292 {
10293     /* Return a new inversion list that is a copy of the input one, which is
10294      * unchanged.  The new list will not be mortal even if the old one was. */
10295
10296     const STRLEN nominal_length = _invlist_len(invlist);
10297     const STRLEN physical_length = SvCUR(invlist);
10298     const bool offset = *(get_invlist_offset_addr(invlist));
10299
10300     PERL_ARGS_ASSERT_INVLIST_CLONE;
10301
10302     if (new_invlist == NULL) {
10303         new_invlist = _new_invlist(nominal_length);
10304     }
10305     else {
10306         sv_upgrade(new_invlist, SVt_INVLIST);
10307         initialize_invlist_guts(new_invlist, nominal_length);
10308     }
10309
10310     *(get_invlist_offset_addr(new_invlist)) = offset;
10311     invlist_set_len(new_invlist, nominal_length, offset);
10312     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10313
10314     return new_invlist;
10315 }
10316
10317 #endif
10318
10319 PERL_STATIC_INLINE STRLEN*
10320 S_get_invlist_iter_addr(SV* invlist)
10321 {
10322     /* Return the address of the UV that contains the current iteration
10323      * position */
10324
10325     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10326
10327     assert(is_invlist(invlist));
10328
10329     return &(((XINVLIST*) SvANY(invlist))->iterator);
10330 }
10331
10332 PERL_STATIC_INLINE void
10333 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10334 {
10335     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10336
10337     *get_invlist_iter_addr(invlist) = 0;
10338 }
10339
10340 PERL_STATIC_INLINE void
10341 S_invlist_iterfinish(SV* invlist)
10342 {
10343     /* Terminate iterator for invlist.  This is to catch development errors.
10344      * Any iteration that is interrupted before completed should call this
10345      * function.  Functions that add code points anywhere else but to the end
10346      * of an inversion list assert that they are not in the middle of an
10347      * iteration.  If they were, the addition would make the iteration
10348      * problematical: if the iteration hadn't reached the place where things
10349      * were being added, it would be ok */
10350
10351     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10352
10353     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10354 }
10355
10356 STATIC bool
10357 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10358 {
10359     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10360      * This call sets in <*start> and <*end>, the next range in <invlist>.
10361      * Returns <TRUE> if successful and the next call will return the next
10362      * range; <FALSE> if was already at the end of the list.  If the latter,
10363      * <*start> and <*end> are unchanged, and the next call to this function
10364      * will start over at the beginning of the list */
10365
10366     STRLEN* pos = get_invlist_iter_addr(invlist);
10367     UV len = _invlist_len(invlist);
10368     UV *array;
10369
10370     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10371
10372     if (*pos >= len) {
10373         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10374         return FALSE;
10375     }
10376
10377     array = invlist_array(invlist);
10378
10379     *start = array[(*pos)++];
10380
10381     if (*pos >= len) {
10382         *end = UV_MAX;
10383     }
10384     else {
10385         *end = array[(*pos)++] - 1;
10386     }
10387
10388     return TRUE;
10389 }
10390
10391 PERL_STATIC_INLINE UV
10392 S_invlist_highest(SV* const invlist)
10393 {
10394     /* Returns the highest code point that matches an inversion list.  This API
10395      * has an ambiguity, as it returns 0 under either the highest is actually
10396      * 0, or if the list is empty.  If this distinction matters to you, check
10397      * for emptiness before calling this function */
10398
10399     UV len = _invlist_len(invlist);
10400     UV *array;
10401
10402     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10403
10404     if (len == 0) {
10405         return 0;
10406     }
10407
10408     array = invlist_array(invlist);
10409
10410     /* The last element in the array in the inversion list always starts a
10411      * range that goes to infinity.  That range may be for code points that are
10412      * matched in the inversion list, or it may be for ones that aren't
10413      * matched.  In the latter case, the highest code point in the set is one
10414      * less than the beginning of this range; otherwise it is the final element
10415      * of this range: infinity */
10416     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10417            ? UV_MAX
10418            : array[len - 1] - 1;
10419 }
10420
10421 STATIC SV *
10422 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10423 {
10424     /* Get the contents of an inversion list into a string SV so that they can
10425      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10426      * traditionally done for debug tracing; otherwise it uses a format
10427      * suitable for just copying to the output, with blanks between ranges and
10428      * a dash between range components */
10429
10430     UV start, end;
10431     SV* output;
10432     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10433     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10434
10435     if (traditional_style) {
10436         output = newSVpvs("\n");
10437     }
10438     else {
10439         output = newSVpvs("");
10440     }
10441
10442     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10443
10444     assert(! invlist_is_iterating(invlist));
10445
10446     invlist_iterinit(invlist);
10447     while (invlist_iternext(invlist, &start, &end)) {
10448         if (end == UV_MAX) {
10449             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10450                                           start, intra_range_delimiter,
10451                                                  inter_range_delimiter);
10452         }
10453         else if (end != start) {
10454             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10455                                           start,
10456                                                    intra_range_delimiter,
10457                                                   end, inter_range_delimiter);
10458         }
10459         else {
10460             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10461                                           start, inter_range_delimiter);
10462         }
10463     }
10464
10465     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10466         SvCUR_set(output, SvCUR(output) - 1);
10467     }
10468
10469     return output;
10470 }
10471
10472 #ifndef PERL_IN_XSUB_RE
10473 void
10474 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10475                          const char * const indent, SV* const invlist)
10476 {
10477     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10478      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10479      * the string 'indent'.  The output looks like this:
10480          [0] 0x000A .. 0x000D
10481          [2] 0x0085
10482          [4] 0x2028 .. 0x2029
10483          [6] 0x3104 .. INFTY
10484      * This means that the first range of code points matched by the list are
10485      * 0xA through 0xD; the second range contains only the single code point
10486      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10487      * are used to define each range (except if the final range extends to
10488      * infinity, only a single element is needed).  The array index of the
10489      * first element for the corresponding range is given in brackets. */
10490
10491     UV start, end;
10492     STRLEN count = 0;
10493
10494     PERL_ARGS_ASSERT__INVLIST_DUMP;
10495
10496     if (invlist_is_iterating(invlist)) {
10497         Perl_dump_indent(aTHX_ level, file,
10498              "%sCan't dump inversion list because is in middle of iterating\n",
10499              indent);
10500         return;
10501     }
10502
10503     invlist_iterinit(invlist);
10504     while (invlist_iternext(invlist, &start, &end)) {
10505         if (end == UV_MAX) {
10506             Perl_dump_indent(aTHX_ level, file,
10507                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10508                                    indent, (UV)count, start);
10509         }
10510         else if (end != start) {
10511             Perl_dump_indent(aTHX_ level, file,
10512                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10513                                 indent, (UV)count, start,         end);
10514         }
10515         else {
10516             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10517                                             indent, (UV)count, start);
10518         }
10519         count += 2;
10520     }
10521 }
10522
10523 #endif
10524
10525 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10526 bool
10527 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10528 {
10529     /* Return a boolean as to if the two passed in inversion lists are
10530      * identical.  The final argument, if TRUE, says to take the complement of
10531      * the second inversion list before doing the comparison */
10532
10533     const UV len_a = _invlist_len(a);
10534     UV len_b = _invlist_len(b);
10535
10536     const UV* array_a = NULL;
10537     const UV* array_b = NULL;
10538
10539     PERL_ARGS_ASSERT__INVLISTEQ;
10540
10541     /* This code avoids accessing the arrays unless it knows the length is
10542      * non-zero */
10543
10544     if (len_a == 0) {
10545         if (len_b == 0) {
10546             return ! complement_b;
10547         }
10548     }
10549     else {
10550         array_a = invlist_array(a);
10551     }
10552
10553     if (len_b != 0) {
10554         array_b = invlist_array(b);
10555     }
10556
10557     /* If are to compare 'a' with the complement of b, set it
10558      * up so are looking at b's complement. */
10559     if (complement_b) {
10560
10561         /* The complement of nothing is everything, so <a> would have to have
10562          * just one element, starting at zero (ending at infinity) */
10563         if (len_b == 0) {
10564             return (len_a == 1 && array_a[0] == 0);
10565         }
10566         if (array_b[0] == 0) {
10567
10568             /* Otherwise, to complement, we invert.  Here, the first element is
10569              * 0, just remove it.  To do this, we just pretend the array starts
10570              * one later */
10571
10572             array_b++;
10573             len_b--;
10574         }
10575         else {
10576
10577             /* But if the first element is not zero, we pretend the list starts
10578              * at the 0 that is always stored immediately before the array. */
10579             array_b--;
10580             len_b++;
10581         }
10582     }
10583
10584     return    len_a == len_b
10585            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10586
10587 }
10588 #endif
10589
10590 /*
10591  * As best we can, determine the characters that can match the start of
10592  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10593  * can be false positive matches
10594  *
10595  * Returns the invlist as a new SV*; it is the caller's responsibility to
10596  * call SvREFCNT_dec() when done with it.
10597  */
10598 STATIC SV*
10599 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10600 {
10601     dVAR;
10602     const U8 * s = (U8*)STRING(node);
10603     SSize_t bytelen = STR_LEN(node);
10604     UV uc;
10605     /* Start out big enough for 2 separate code points */
10606     SV* invlist = _new_invlist(4);
10607
10608     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10609
10610     if (! UTF) {
10611         uc = *s;
10612
10613         /* We punt and assume can match anything if the node begins
10614          * with a multi-character fold.  Things are complicated.  For
10615          * example, /ffi/i could match any of:
10616          *  "\N{LATIN SMALL LIGATURE FFI}"
10617          *  "\N{LATIN SMALL LIGATURE FF}I"
10618          *  "F\N{LATIN SMALL LIGATURE FI}"
10619          *  plus several other things; and making sure we have all the
10620          *  possibilities is hard. */
10621         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10622             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10623         }
10624         else {
10625             /* Any Latin1 range character can potentially match any
10626              * other depending on the locale, and in Turkic locales, U+130 and
10627              * U+131 */
10628             if (OP(node) == EXACTFL) {
10629                 _invlist_union(invlist, PL_Latin1, &invlist);
10630                 invlist = add_cp_to_invlist(invlist,
10631                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10632                 invlist = add_cp_to_invlist(invlist,
10633                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10634             }
10635             else {
10636                 /* But otherwise, it matches at least itself.  We can
10637                  * quickly tell if it has a distinct fold, and if so,
10638                  * it matches that as well */
10639                 invlist = add_cp_to_invlist(invlist, uc);
10640                 if (IS_IN_SOME_FOLD_L1(uc))
10641                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10642             }
10643
10644             /* Some characters match above-Latin1 ones under /i.  This
10645              * is true of EXACTFL ones when the locale is UTF-8 */
10646             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10647                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10648                                     && OP(node) != EXACTFAA_NO_TRIE)))
10649             {
10650                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10651             }
10652         }
10653     }
10654     else {  /* Pattern is UTF-8 */
10655         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10656         const U8* e = s + bytelen;
10657         IV fc;
10658
10659         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10660
10661         /* The only code points that aren't folded in a UTF EXACTFish
10662          * node are are the problematic ones in EXACTFL nodes */
10663         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10664             /* We need to check for the possibility that this EXACTFL
10665              * node begins with a multi-char fold.  Therefore we fold
10666              * the first few characters of it so that we can make that
10667              * check */
10668             U8 *d = folded;
10669             int i;
10670
10671             fc = -1;
10672             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10673                 if (isASCII(*s)) {
10674                     *(d++) = (U8) toFOLD(*s);
10675                     if (fc < 0) {       /* Save the first fold */
10676                         fc = *(d-1);
10677                     }
10678                     s++;
10679                 }
10680                 else {
10681                     STRLEN len;
10682                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10683                     if (fc < 0) {       /* Save the first fold */
10684                         fc = fold;
10685                     }
10686                     d += len;
10687                     s += UTF8SKIP(s);
10688                 }
10689             }
10690
10691             /* And set up so the code below that looks in this folded
10692              * buffer instead of the node's string */
10693             e = d;
10694             s = folded;
10695         }
10696
10697         /* When we reach here 's' points to the fold of the first
10698          * character(s) of the node; and 'e' points to far enough along
10699          * the folded string to be just past any possible multi-char
10700          * fold.
10701          *
10702          * Unlike the non-UTF-8 case, the macro for determining if a
10703          * string is a multi-char fold requires all the characters to
10704          * already be folded.  This is because of all the complications
10705          * if not.  Note that they are folded anyway, except in EXACTFL
10706          * nodes.  Like the non-UTF case above, we punt if the node
10707          * begins with a multi-char fold  */
10708
10709         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10710             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10711         }
10712         else {  /* Single char fold */
10713             unsigned int k;
10714             unsigned int first_fold;
10715             const unsigned int * remaining_folds;
10716             Size_t folds_count;
10717
10718             /* It matches itself */
10719             invlist = add_cp_to_invlist(invlist, fc);
10720
10721             /* ... plus all the things that fold to it, which are found in
10722              * PL_utf8_foldclosures */
10723             folds_count = _inverse_folds(fc, &first_fold,
10724                                                 &remaining_folds);
10725             for (k = 0; k < folds_count; k++) {
10726                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10727
10728                 /* /aa doesn't allow folds between ASCII and non- */
10729                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10730                     && isASCII(c) != isASCII(fc))
10731                 {
10732                     continue;
10733                 }
10734
10735                 invlist = add_cp_to_invlist(invlist, c);
10736             }
10737
10738             if (OP(node) == EXACTFL) {
10739
10740                 /* If either [iI] are present in an EXACTFL node the above code
10741                  * should have added its normal case pair, but under a Turkish
10742                  * locale they could match instead the case pairs from it.  Add
10743                  * those as potential matches as well */
10744                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10745                     invlist = add_cp_to_invlist(invlist,
10746                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10747                     invlist = add_cp_to_invlist(invlist,
10748                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10749                 }
10750                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10751                     invlist = add_cp_to_invlist(invlist, 'I');
10752                 }
10753                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10754                     invlist = add_cp_to_invlist(invlist, 'i');
10755                 }
10756             }
10757         }
10758     }
10759
10760     return invlist;
10761 }
10762
10763 #undef HEADER_LENGTH
10764 #undef TO_INTERNAL_SIZE
10765 #undef FROM_INTERNAL_SIZE
10766 #undef INVLIST_VERSION_ID
10767
10768 /* End of inversion list object */
10769
10770 STATIC void
10771 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10772 {
10773     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10774      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10775      * should point to the first flag; it is updated on output to point to the
10776      * final ')' or ':'.  There needs to be at least one flag, or this will
10777      * abort */
10778
10779     /* for (?g), (?gc), and (?o) warnings; warning
10780        about (?c) will warn about (?g) -- japhy    */
10781
10782 #define WASTED_O  0x01
10783 #define WASTED_G  0x02
10784 #define WASTED_C  0x04
10785 #define WASTED_GC (WASTED_G|WASTED_C)
10786     I32 wastedflags = 0x00;
10787     U32 posflags = 0, negflags = 0;
10788     U32 *flagsp = &posflags;
10789     char has_charset_modifier = '\0';
10790     regex_charset cs;
10791     bool has_use_defaults = FALSE;
10792     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10793     int x_mod_count = 0;
10794
10795     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10796
10797     /* '^' as an initial flag sets certain defaults */
10798     if (UCHARAT(RExC_parse) == '^') {
10799         RExC_parse++;
10800         has_use_defaults = TRUE;
10801         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10802         cs = (RExC_uni_semantics)
10803              ? REGEX_UNICODE_CHARSET
10804              : REGEX_DEPENDS_CHARSET;
10805         set_regex_charset(&RExC_flags, cs);
10806     }
10807     else {
10808         cs = get_regex_charset(RExC_flags);
10809         if (   cs == REGEX_DEPENDS_CHARSET
10810             && RExC_uni_semantics)
10811         {
10812             cs = REGEX_UNICODE_CHARSET;
10813         }
10814     }
10815
10816     while (RExC_parse < RExC_end) {
10817         /* && strchr("iogcmsx", *RExC_parse) */
10818         /* (?g), (?gc) and (?o) are useless here
10819            and must be globally applied -- japhy */
10820         switch (*RExC_parse) {
10821
10822             /* Code for the imsxn flags */
10823             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10824
10825             case LOCALE_PAT_MOD:
10826                 if (has_charset_modifier) {
10827                     goto excess_modifier;
10828                 }
10829                 else if (flagsp == &negflags) {
10830                     goto neg_modifier;
10831                 }
10832                 cs = REGEX_LOCALE_CHARSET;
10833                 has_charset_modifier = LOCALE_PAT_MOD;
10834                 break;
10835             case UNICODE_PAT_MOD:
10836                 if (has_charset_modifier) {
10837                     goto excess_modifier;
10838                 }
10839                 else if (flagsp == &negflags) {
10840                     goto neg_modifier;
10841                 }
10842                 cs = REGEX_UNICODE_CHARSET;
10843                 has_charset_modifier = UNICODE_PAT_MOD;
10844                 break;
10845             case ASCII_RESTRICT_PAT_MOD:
10846                 if (flagsp == &negflags) {
10847                     goto neg_modifier;
10848                 }
10849                 if (has_charset_modifier) {
10850                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10851                         goto excess_modifier;
10852                     }
10853                     /* Doubled modifier implies more restricted */
10854                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10855                 }
10856                 else {
10857                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10858                 }
10859                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10860                 break;
10861             case DEPENDS_PAT_MOD:
10862                 if (has_use_defaults) {
10863                     goto fail_modifiers;
10864                 }
10865                 else if (flagsp == &negflags) {
10866                     goto neg_modifier;
10867                 }
10868                 else if (has_charset_modifier) {
10869                     goto excess_modifier;
10870                 }
10871
10872                 /* The dual charset means unicode semantics if the
10873                  * pattern (or target, not known until runtime) are
10874                  * utf8, or something in the pattern indicates unicode
10875                  * semantics */
10876                 cs = (RExC_uni_semantics)
10877                      ? REGEX_UNICODE_CHARSET
10878                      : REGEX_DEPENDS_CHARSET;
10879                 has_charset_modifier = DEPENDS_PAT_MOD;
10880                 break;
10881               excess_modifier:
10882                 RExC_parse++;
10883                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10884                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10885                 }
10886                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10887                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10888                                         *(RExC_parse - 1));
10889                 }
10890                 else {
10891                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10892                 }
10893                 NOT_REACHED; /*NOTREACHED*/
10894               neg_modifier:
10895                 RExC_parse++;
10896                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10897                                     *(RExC_parse - 1));
10898                 NOT_REACHED; /*NOTREACHED*/
10899             case ONCE_PAT_MOD: /* 'o' */
10900             case GLOBAL_PAT_MOD: /* 'g' */
10901                 if (ckWARN(WARN_REGEXP)) {
10902                     const I32 wflagbit = *RExC_parse == 'o'
10903                                          ? WASTED_O
10904                                          : WASTED_G;
10905                     if (! (wastedflags & wflagbit) ) {
10906                         wastedflags |= wflagbit;
10907                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10908                         vWARN5(
10909                             RExC_parse + 1,
10910                             "Useless (%s%c) - %suse /%c modifier",
10911                             flagsp == &negflags ? "?-" : "?",
10912                             *RExC_parse,
10913                             flagsp == &negflags ? "don't " : "",
10914                             *RExC_parse
10915                         );
10916                     }
10917                 }
10918                 break;
10919
10920             case CONTINUE_PAT_MOD: /* 'c' */
10921                 if (ckWARN(WARN_REGEXP)) {
10922                     if (! (wastedflags & WASTED_C) ) {
10923                         wastedflags |= WASTED_GC;
10924                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10925                         vWARN3(
10926                             RExC_parse + 1,
10927                             "Useless (%sc) - %suse /gc modifier",
10928                             flagsp == &negflags ? "?-" : "?",
10929                             flagsp == &negflags ? "don't " : ""
10930                         );
10931                     }
10932                 }
10933                 break;
10934             case KEEPCOPY_PAT_MOD: /* 'p' */
10935                 if (flagsp == &negflags) {
10936                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10937                 } else {
10938                     *flagsp |= RXf_PMf_KEEPCOPY;
10939                 }
10940                 break;
10941             case '-':
10942                 /* A flag is a default iff it is following a minus, so
10943                  * if there is a minus, it means will be trying to
10944                  * re-specify a default which is an error */
10945                 if (has_use_defaults || flagsp == &negflags) {
10946                     goto fail_modifiers;
10947                 }
10948                 flagsp = &negflags;
10949                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10950                 x_mod_count = 0;
10951                 break;
10952             case ':':
10953             case ')':
10954
10955                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10956                     negflags |= RXf_PMf_EXTENDED_MORE;
10957                 }
10958                 RExC_flags |= posflags;
10959
10960                 if (negflags & RXf_PMf_EXTENDED) {
10961                     negflags |= RXf_PMf_EXTENDED_MORE;
10962                 }
10963                 RExC_flags &= ~negflags;
10964                 set_regex_charset(&RExC_flags, cs);
10965
10966                 return;
10967             default:
10968               fail_modifiers:
10969                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
10970                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10971                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10972                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10973                 NOT_REACHED; /*NOTREACHED*/
10974         }
10975
10976         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10977     }
10978
10979     vFAIL("Sequence (?... not terminated");
10980 }
10981
10982 /*
10983  - reg - regular expression, i.e. main body or parenthesized thing
10984  *
10985  * Caller must absorb opening parenthesis.
10986  *
10987  * Combining parenthesis handling with the base level of regular expression
10988  * is a trifle forced, but the need to tie the tails of the branches to what
10989  * follows makes it hard to avoid.
10990  */
10991 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10992 #ifdef DEBUGGING
10993 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10994 #else
10995 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10996 #endif
10997
10998 PERL_STATIC_INLINE regnode_offset
10999 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11000                              I32 *flagp,
11001                              char * parse_start,
11002                              char ch
11003                       )
11004 {
11005     regnode_offset ret;
11006     char* name_start = RExC_parse;
11007     U32 num = 0;
11008     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11009     GET_RE_DEBUG_FLAGS_DECL;
11010
11011     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11012
11013     if (RExC_parse == name_start || *RExC_parse != ch) {
11014         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11015         vFAIL2("Sequence %.3s... not terminated", parse_start);
11016     }
11017
11018     if (sv_dat) {
11019         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11020         RExC_rxi->data->data[num]=(void*)sv_dat;
11021         SvREFCNT_inc_simple_void_NN(sv_dat);
11022     }
11023     RExC_sawback = 1;
11024     ret = reganode(pRExC_state,
11025                    ((! FOLD)
11026                      ? REFN
11027                      : (ASCII_FOLD_RESTRICTED)
11028                        ? REFFAN
11029                        : (AT_LEAST_UNI_SEMANTICS)
11030                          ? REFFUN
11031                          : (LOC)
11032                            ? REFFLN
11033                            : REFFN),
11034                     num);
11035     *flagp |= HASWIDTH;
11036
11037     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11038     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11039
11040     nextchar(pRExC_state);
11041     return ret;
11042 }
11043
11044 /* On success, returns the offset at which any next node should be placed into
11045  * the regex engine program being compiled.
11046  *
11047  * Returns 0 otherwise, with *flagp set to indicate why:
11048  *  TRYAGAIN        at the end of (?) that only sets flags.
11049  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11050  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11051  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11052  *  happen.  */
11053 STATIC regnode_offset
11054 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11055     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11056      * 2 is like 1, but indicates that nextchar() has been called to advance
11057      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11058      * this flag alerts us to the need to check for that */
11059 {
11060     regnode_offset ret = 0;    /* Will be the head of the group. */
11061     regnode_offset br;
11062     regnode_offset lastbr;
11063     regnode_offset ender = 0;
11064     I32 parno = 0;
11065     I32 flags;
11066     U32 oregflags = RExC_flags;
11067     bool have_branch = 0;
11068     bool is_open = 0;
11069     I32 freeze_paren = 0;
11070     I32 after_freeze = 0;
11071     I32 num; /* numeric backreferences */
11072     SV * max_open;  /* Max number of unclosed parens */
11073
11074     char * parse_start = RExC_parse; /* MJD */
11075     char * const oregcomp_parse = RExC_parse;
11076
11077     GET_RE_DEBUG_FLAGS_DECL;
11078
11079     PERL_ARGS_ASSERT_REG;
11080     DEBUG_PARSE("reg ");
11081
11082
11083     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11084     assert(max_open);
11085     if (!SvIOK(max_open)) {
11086         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11087     }
11088     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11089                                               open paren */
11090         vFAIL("Too many nested open parens");
11091     }
11092
11093     *flagp = 0;                         /* Tentatively. */
11094
11095     if (RExC_in_lookbehind) {
11096         RExC_in_lookbehind++;
11097     }
11098     if (RExC_in_lookahead) {
11099         RExC_in_lookahead++;
11100     }
11101
11102     /* Having this true makes it feasible to have a lot fewer tests for the
11103      * parse pointer being in scope.  For example, we can write
11104      *      while(isFOO(*RExC_parse)) RExC_parse++;
11105      * instead of
11106      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11107      */
11108     assert(*RExC_end == '\0');
11109
11110     /* Make an OPEN node, if parenthesized. */
11111     if (paren) {
11112
11113         /* Under /x, space and comments can be gobbled up between the '(' and
11114          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11115          * intervening space, as the sequence is a token, and a token should be
11116          * indivisible */
11117         bool has_intervening_patws = (paren == 2)
11118                                   && *(RExC_parse - 1) != '(';
11119
11120         if (RExC_parse >= RExC_end) {
11121             vFAIL("Unmatched (");
11122         }
11123
11124         if (paren == 'r') {     /* Atomic script run */
11125             paren = '>';
11126             goto parse_rest;
11127         }
11128         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11129             char *start_verb = RExC_parse + 1;
11130             STRLEN verb_len;
11131             char *start_arg = NULL;
11132             unsigned char op = 0;
11133             int arg_required = 0;
11134             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11135             bool has_upper = FALSE;
11136
11137             if (has_intervening_patws) {
11138                 RExC_parse++;   /* past the '*' */
11139
11140                 /* For strict backwards compatibility, don't change the message
11141                  * now that we also have lowercase operands */
11142                 if (isUPPER(*RExC_parse)) {
11143                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11144                 }
11145                 else {
11146                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11147                 }
11148             }
11149             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11150                 if ( *RExC_parse == ':' ) {
11151                     start_arg = RExC_parse + 1;
11152                     break;
11153                 }
11154                 else if (! UTF) {
11155                     if (isUPPER(*RExC_parse)) {
11156                         has_upper = TRUE;
11157                     }
11158                     RExC_parse++;
11159                 }
11160                 else {
11161                     RExC_parse += UTF8SKIP(RExC_parse);
11162                 }
11163             }
11164             verb_len = RExC_parse - start_verb;
11165             if ( start_arg ) {
11166                 if (RExC_parse >= RExC_end) {
11167                     goto unterminated_verb_pattern;
11168                 }
11169
11170                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11171                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11172                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11173                 }
11174                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11175                   unterminated_verb_pattern:
11176                     if (has_upper) {
11177                         vFAIL("Unterminated verb pattern argument");
11178                     }
11179                     else {
11180                         vFAIL("Unterminated '(*...' argument");
11181                     }
11182                 }
11183             } else {
11184                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11185                     if (has_upper) {
11186                         vFAIL("Unterminated verb pattern");
11187                     }
11188                     else {
11189                         vFAIL("Unterminated '(*...' construct");
11190                     }
11191                 }
11192             }
11193
11194             /* Here, we know that RExC_parse < RExC_end */
11195
11196             switch ( *start_verb ) {
11197             case 'A':  /* (*ACCEPT) */
11198                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11199                     op = ACCEPT;
11200                     internal_argval = RExC_nestroot;
11201                 }
11202                 break;
11203             case 'C':  /* (*COMMIT) */
11204                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11205                     op = COMMIT;
11206                 break;
11207             case 'F':  /* (*FAIL) */
11208                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11209                     op = OPFAIL;
11210                 }
11211                 break;
11212             case ':':  /* (*:NAME) */
11213             case 'M':  /* (*MARK:NAME) */
11214                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11215                     op = MARKPOINT;
11216                     arg_required = 1;
11217                 }
11218                 break;
11219             case 'P':  /* (*PRUNE) */
11220                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11221                     op = PRUNE;
11222                 break;
11223             case 'S':   /* (*SKIP) */
11224                 if ( memEQs(start_verb, verb_len,"SKIP") )
11225                     op = SKIP;
11226                 break;
11227             case 'T':  /* (*THEN) */
11228                 /* [19:06] <TimToady> :: is then */
11229                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11230                     op = CUTGROUP;
11231                     RExC_seen |= REG_CUTGROUP_SEEN;
11232                 }
11233                 break;
11234             case 'a':
11235                 if (   memEQs(start_verb, verb_len, "asr")
11236                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11237                 {
11238                     paren = 'r';        /* Mnemonic: recursed run */
11239                     goto script_run;
11240                 }
11241                 else if (memEQs(start_verb, verb_len, "atomic")) {
11242                     paren = 't';    /* AtOMIC */
11243                     goto alpha_assertions;
11244                 }
11245                 break;
11246             case 'p':
11247                 if (   memEQs(start_verb, verb_len, "plb")
11248                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11249                 {
11250                     paren = 'b';
11251                     goto lookbehind_alpha_assertions;
11252                 }
11253                 else if (   memEQs(start_verb, verb_len, "pla")
11254                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11255                 {
11256                     paren = 'a';
11257                     goto alpha_assertions;
11258                 }
11259                 break;
11260             case 'n':
11261                 if (   memEQs(start_verb, verb_len, "nlb")
11262                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11263                 {
11264                     paren = 'B';
11265                     goto lookbehind_alpha_assertions;
11266                 }
11267                 else if (   memEQs(start_verb, verb_len, "nla")
11268                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11269                 {
11270                     paren = 'A';
11271                     goto alpha_assertions;
11272                 }
11273                 break;
11274             case 's':
11275                 if (   memEQs(start_verb, verb_len, "sr")
11276                     || memEQs(start_verb, verb_len, "script_run"))
11277                 {
11278                     regnode_offset atomic;
11279
11280                     paren = 's';
11281
11282                    script_run:
11283
11284                     /* This indicates Unicode rules. */
11285                     REQUIRE_UNI_RULES(flagp, 0);
11286
11287                     if (! start_arg) {
11288                         goto no_colon;
11289                     }
11290
11291                     RExC_parse = start_arg;
11292
11293                     if (RExC_in_script_run) {
11294
11295                         /*  Nested script runs are treated as no-ops, because
11296                          *  if the nested one fails, the outer one must as
11297                          *  well.  It could fail sooner, and avoid (??{} with
11298                          *  side effects, but that is explicitly documented as
11299                          *  undefined behavior. */
11300
11301                         ret = 0;
11302
11303                         if (paren == 's') {
11304                             paren = ':';
11305                             goto parse_rest;
11306                         }
11307
11308                         /* But, the atomic part of a nested atomic script run
11309                          * isn't a no-op, but can be treated just like a '(?>'
11310                          * */
11311                         paren = '>';
11312                         goto parse_rest;
11313                     }
11314
11315                     /* By doing this here, we avoid extra warnings for nested
11316                      * script runs */
11317                     ckWARNexperimental(RExC_parse,
11318                         WARN_EXPERIMENTAL__SCRIPT_RUN,
11319                         "The script_run feature is experimental");
11320
11321                     if (paren == 's') {
11322                         /* Here, we're starting a new regular script run */
11323                         ret = reg_node(pRExC_state, SROPEN);
11324                         RExC_in_script_run = 1;
11325                         is_open = 1;
11326                         goto parse_rest;
11327                     }
11328
11329                     /* Here, we are starting an atomic script run.  This is
11330                      * handled by recursing to deal with the atomic portion
11331                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11332
11333                     ret = reg_node(pRExC_state, SROPEN);
11334
11335                     RExC_in_script_run = 1;
11336
11337                     atomic = reg(pRExC_state, 'r', &flags, depth);
11338                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11339                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11340                         return 0;
11341                     }
11342
11343                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11344                         REQUIRE_BRANCHJ(flagp, 0);
11345                     }
11346
11347                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11348                                                                 SRCLOSE)))
11349                     {
11350                         REQUIRE_BRANCHJ(flagp, 0);
11351                     }
11352
11353                     RExC_in_script_run = 0;
11354                     return ret;
11355                 }
11356
11357                 break;
11358
11359             lookbehind_alpha_assertions:
11360                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11361                 RExC_in_lookbehind++;
11362                 /*FALLTHROUGH*/
11363
11364             alpha_assertions:
11365                 ckWARNexperimental(RExC_parse,
11366                         WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11367                         "The alpha_assertions feature is experimental");
11368
11369                 RExC_seen_zerolen++;
11370
11371                 if (! start_arg) {
11372                     goto no_colon;
11373                 }
11374
11375                 /* An empty negative lookahead assertion simply is failure */
11376                 if (paren == 'A' && RExC_parse == start_arg) {
11377                     ret=reganode(pRExC_state, OPFAIL, 0);
11378                     nextchar(pRExC_state);
11379                     return ret;
11380                 }
11381
11382                 RExC_parse = start_arg;
11383                 goto parse_rest;
11384
11385               no_colon:
11386                 vFAIL2utf8f(
11387                 "'(*%" UTF8f "' requires a terminating ':'",
11388                 UTF8fARG(UTF, verb_len, start_verb));
11389                 NOT_REACHED; /*NOTREACHED*/
11390
11391             } /* End of switch */
11392             if ( ! op ) {
11393                 RExC_parse += UTF
11394                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11395                               : 1;
11396                 if (has_upper || verb_len == 0) {
11397                     vFAIL2utf8f(
11398                     "Unknown verb pattern '%" UTF8f "'",
11399                     UTF8fARG(UTF, verb_len, start_verb));
11400                 }
11401                 else {
11402                     vFAIL2utf8f(
11403                     "Unknown '(*...)' construct '%" UTF8f "'",
11404                     UTF8fARG(UTF, verb_len, start_verb));
11405                 }
11406             }
11407             if ( RExC_parse == start_arg ) {
11408                 start_arg = NULL;
11409             }
11410             if ( arg_required && !start_arg ) {
11411                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11412                     verb_len, start_verb);
11413             }
11414             if (internal_argval == -1) {
11415                 ret = reganode(pRExC_state, op, 0);
11416             } else {
11417                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11418             }
11419             RExC_seen |= REG_VERBARG_SEEN;
11420             if (start_arg) {
11421                 SV *sv = newSVpvn( start_arg,
11422                                     RExC_parse - start_arg);
11423                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11424                                         STR_WITH_LEN("S"));
11425                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11426                 FLAGS(REGNODE_p(ret)) = 1;
11427             } else {
11428                 FLAGS(REGNODE_p(ret)) = 0;
11429             }
11430             if ( internal_argval != -1 )
11431                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11432             nextchar(pRExC_state);
11433             return ret;
11434         }
11435         else if (*RExC_parse == '?') { /* (?...) */
11436             bool is_logical = 0;
11437             const char * const seqstart = RExC_parse;
11438             const char * endptr;
11439             if (has_intervening_patws) {
11440                 RExC_parse++;
11441                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11442             }
11443
11444             RExC_parse++;           /* past the '?' */
11445             paren = *RExC_parse;    /* might be a trailing NUL, if not
11446                                        well-formed */
11447             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11448             if (RExC_parse > RExC_end) {
11449                 paren = '\0';
11450             }
11451             ret = 0;                    /* For look-ahead/behind. */
11452             switch (paren) {
11453
11454             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11455                 paren = *RExC_parse;
11456                 if ( paren == '<') {    /* (?P<...>) named capture */
11457                     RExC_parse++;
11458                     if (RExC_parse >= RExC_end) {
11459                         vFAIL("Sequence (?P<... not terminated");
11460                     }
11461                     goto named_capture;
11462                 }
11463                 else if (paren == '>') {   /* (?P>name) named recursion */
11464                     RExC_parse++;
11465                     if (RExC_parse >= RExC_end) {
11466                         vFAIL("Sequence (?P>... not terminated");
11467                     }
11468                     goto named_recursion;
11469                 }
11470                 else if (paren == '=') {   /* (?P=...)  named backref */
11471                     RExC_parse++;
11472                     return handle_named_backref(pRExC_state, flagp,
11473                                                 parse_start, ')');
11474                 }
11475                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11476                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11477                 vFAIL3("Sequence (%.*s...) not recognized",
11478                                 RExC_parse-seqstart, seqstart);
11479                 NOT_REACHED; /*NOTREACHED*/
11480             case '<':           /* (?<...) */
11481                 if (*RExC_parse == '!')
11482                     paren = ',';
11483                 else if (*RExC_parse != '=')
11484               named_capture:
11485                 {               /* (?<...>) */
11486                     char *name_start;
11487                     SV *svname;
11488                     paren= '>';
11489                 /* FALLTHROUGH */
11490             case '\'':          /* (?'...') */
11491                     name_start = RExC_parse;
11492                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11493                     if (   RExC_parse == name_start
11494                         || RExC_parse >= RExC_end
11495                         || *RExC_parse != paren)
11496                     {
11497                         vFAIL2("Sequence (?%c... not terminated",
11498                             paren=='>' ? '<' : paren);
11499                     }
11500                     {
11501                         HE *he_str;
11502                         SV *sv_dat = NULL;
11503                         if (!svname) /* shouldn't happen */
11504                             Perl_croak(aTHX_
11505                                 "panic: reg_scan_name returned NULL");
11506                         if (!RExC_paren_names) {
11507                             RExC_paren_names= newHV();
11508                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11509 #ifdef DEBUGGING
11510                             RExC_paren_name_list= newAV();
11511                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11512 #endif
11513                         }
11514                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11515                         if ( he_str )
11516                             sv_dat = HeVAL(he_str);
11517                         if ( ! sv_dat ) {
11518                             /* croak baby croak */
11519                             Perl_croak(aTHX_
11520                                 "panic: paren_name hash element allocation failed");
11521                         } else if ( SvPOK(sv_dat) ) {
11522                             /* (?|...) can mean we have dupes so scan to check
11523                                its already been stored. Maybe a flag indicating
11524                                we are inside such a construct would be useful,
11525                                but the arrays are likely to be quite small, so
11526                                for now we punt -- dmq */
11527                             IV count = SvIV(sv_dat);
11528                             I32 *pv = (I32*)SvPVX(sv_dat);
11529                             IV i;
11530                             for ( i = 0 ; i < count ; i++ ) {
11531                                 if ( pv[i] == RExC_npar ) {
11532                                     count = 0;
11533                                     break;
11534                                 }
11535                             }
11536                             if ( count ) {
11537                                 pv = (I32*)SvGROW(sv_dat,
11538                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11539                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11540                                 pv[count] = RExC_npar;
11541                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11542                             }
11543                         } else {
11544                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11545                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11546                                                                 sizeof(I32));
11547                             SvIOK_on(sv_dat);
11548                             SvIV_set(sv_dat, 1);
11549                         }
11550 #ifdef DEBUGGING
11551                         /* Yes this does cause a memory leak in debugging Perls
11552                          * */
11553                         if (!av_store(RExC_paren_name_list,
11554                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11555                             SvREFCNT_dec_NN(svname);
11556 #endif
11557
11558                         /*sv_dump(sv_dat);*/
11559                     }
11560                     nextchar(pRExC_state);
11561                     paren = 1;
11562                     goto capturing_parens;
11563                 }
11564
11565                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11566                 RExC_in_lookbehind++;
11567                 RExC_parse++;
11568                 if (RExC_parse >= RExC_end) {
11569                     vFAIL("Sequence (?... not terminated");
11570                 }
11571                 RExC_seen_zerolen++;
11572                 break;
11573             case '=':           /* (?=...) */
11574                 RExC_seen_zerolen++;
11575                 RExC_in_lookahead++;
11576                 break;
11577             case '!':           /* (?!...) */
11578                 RExC_seen_zerolen++;
11579                 /* check if we're really just a "FAIL" assertion */
11580                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11581                                         FALSE /* Don't force to /x */ );
11582                 if (*RExC_parse == ')') {
11583                     ret=reganode(pRExC_state, OPFAIL, 0);
11584                     nextchar(pRExC_state);
11585                     return ret;
11586                 }
11587                 break;
11588             case '|':           /* (?|...) */
11589                 /* branch reset, behave like a (?:...) except that
11590                    buffers in alternations share the same numbers */
11591                 paren = ':';
11592                 after_freeze = freeze_paren = RExC_npar;
11593
11594                 /* XXX This construct currently requires an extra pass.
11595                  * Investigation would be required to see if that could be
11596                  * changed */
11597                 REQUIRE_PARENS_PASS;
11598                 break;
11599             case ':':           /* (?:...) */
11600             case '>':           /* (?>...) */
11601                 break;
11602             case '$':           /* (?$...) */
11603             case '@':           /* (?@...) */
11604                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11605                 break;
11606             case '0' :           /* (?0) */
11607             case 'R' :           /* (?R) */
11608                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11609                     FAIL("Sequence (?R) not terminated");
11610                 num = 0;
11611                 RExC_seen |= REG_RECURSE_SEEN;
11612
11613                 /* XXX These constructs currently require an extra pass.
11614                  * It probably could be changed */
11615                 REQUIRE_PARENS_PASS;
11616
11617                 *flagp |= POSTPONED;
11618                 goto gen_recurse_regop;
11619                 /*notreached*/
11620             /* named and numeric backreferences */
11621             case '&':            /* (?&NAME) */
11622                 parse_start = RExC_parse - 1;
11623               named_recursion:
11624                 {
11625                     SV *sv_dat = reg_scan_name(pRExC_state,
11626                                                REG_RSN_RETURN_DATA);
11627                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11628                 }
11629                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11630                     vFAIL("Sequence (?&... not terminated");
11631                 goto gen_recurse_regop;
11632                 /* NOTREACHED */
11633             case '+':
11634                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11635                     RExC_parse++;
11636                     vFAIL("Illegal pattern");
11637                 }
11638                 goto parse_recursion;
11639                 /* NOTREACHED*/
11640             case '-': /* (?-1) */
11641                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11642                     RExC_parse--; /* rewind to let it be handled later */
11643                     goto parse_flags;
11644                 }
11645                 /* FALLTHROUGH */
11646             case '1': case '2': case '3': case '4': /* (?1) */
11647             case '5': case '6': case '7': case '8': case '9':
11648                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11649               parse_recursion:
11650                 {
11651                     bool is_neg = FALSE;
11652                     UV unum;
11653                     parse_start = RExC_parse - 1; /* MJD */
11654                     if (*RExC_parse == '-') {
11655                         RExC_parse++;
11656                         is_neg = TRUE;
11657                     }
11658                     endptr = RExC_end;
11659                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11660                         && unum <= I32_MAX
11661                     ) {
11662                         num = (I32)unum;
11663                         RExC_parse = (char*)endptr;
11664                     } else
11665                         num = I32_MAX;
11666                     if (is_neg) {
11667                         /* Some limit for num? */
11668                         num = -num;
11669                     }
11670                 }
11671                 if (*RExC_parse!=')')
11672                     vFAIL("Expecting close bracket");
11673
11674               gen_recurse_regop:
11675                 if ( paren == '-' ) {
11676                     /*
11677                     Diagram of capture buffer numbering.
11678                     Top line is the normal capture buffer numbers
11679                     Bottom line is the negative indexing as from
11680                     the X (the (?-2))
11681
11682                     +   1 2    3 4 5 X          6 7
11683                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11684                     -   5 4    3 2 1 X          x x
11685
11686                     */
11687                     num = RExC_npar + num;
11688                     if (num < 1)  {
11689
11690                         /* It might be a forward reference; we can't fail until
11691                          * we know, by completing the parse to get all the
11692                          * groups, and then reparsing */
11693                         if (ALL_PARENS_COUNTED)  {
11694                             RExC_parse++;
11695                             vFAIL("Reference to nonexistent group");
11696                         }
11697                         else {
11698                             REQUIRE_PARENS_PASS;
11699                         }
11700                     }
11701                 } else if ( paren == '+' ) {
11702                     num = RExC_npar + num - 1;
11703                 }
11704                 /* We keep track how many GOSUB items we have produced.
11705                    To start off the ARG2L() of the GOSUB holds its "id",
11706                    which is used later in conjunction with RExC_recurse
11707                    to calculate the offset we need to jump for the GOSUB,
11708                    which it will store in the final representation.
11709                    We have to defer the actual calculation until much later
11710                    as the regop may move.
11711                  */
11712
11713                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11714                 if (num >= RExC_npar) {
11715
11716                     /* It might be a forward reference; we can't fail until we
11717                      * know, by completing the parse to get all the groups, and
11718                      * then reparsing */
11719                     if (ALL_PARENS_COUNTED)  {
11720                         if (num >= RExC_total_parens) {
11721                             RExC_parse++;
11722                             vFAIL("Reference to nonexistent group");
11723                         }
11724                     }
11725                     else {
11726                         REQUIRE_PARENS_PASS;
11727                     }
11728                 }
11729                 RExC_recurse_count++;
11730                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11731                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11732                             22, "|    |", (int)(depth * 2 + 1), "",
11733                             (UV)ARG(REGNODE_p(ret)),
11734                             (IV)ARG2L(REGNODE_p(ret))));
11735                 RExC_seen |= REG_RECURSE_SEEN;
11736
11737                 Set_Node_Length(REGNODE_p(ret),
11738                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11739                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11740
11741                 *flagp |= POSTPONED;
11742                 assert(*RExC_parse == ')');
11743                 nextchar(pRExC_state);
11744                 return ret;
11745
11746             /* NOTREACHED */
11747
11748             case '?':           /* (??...) */
11749                 is_logical = 1;
11750                 if (*RExC_parse != '{') {
11751                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11752                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11753                     vFAIL2utf8f(
11754                         "Sequence (%" UTF8f "...) not recognized",
11755                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11756                     NOT_REACHED; /*NOTREACHED*/
11757                 }
11758                 *flagp |= POSTPONED;
11759                 paren = '{';
11760                 RExC_parse++;
11761                 /* FALLTHROUGH */
11762             case '{':           /* (?{...}) */
11763             {
11764                 U32 n = 0;
11765                 struct reg_code_block *cb;
11766                 OP * o;
11767
11768                 RExC_seen_zerolen++;
11769
11770                 if (   !pRExC_state->code_blocks
11771                     || pRExC_state->code_index
11772                                         >= pRExC_state->code_blocks->count
11773                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11774                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11775                             - RExC_start)
11776                 ) {
11777                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11778                         FAIL("panic: Sequence (?{...}): no code block found\n");
11779                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11780                 }
11781                 /* this is a pre-compiled code block (?{...}) */
11782                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11783                 RExC_parse = RExC_start + cb->end;
11784                 o = cb->block;
11785                 if (cb->src_regex) {
11786                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11787                     RExC_rxi->data->data[n] =
11788                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11789                     RExC_rxi->data->data[n+1] = (void*)o;
11790                 }
11791                 else {
11792                     n = add_data(pRExC_state,
11793                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11794                     RExC_rxi->data->data[n] = (void*)o;
11795                 }
11796                 pRExC_state->code_index++;
11797                 nextchar(pRExC_state);
11798
11799                 if (is_logical) {
11800                     regnode_offset eval;
11801                     ret = reg_node(pRExC_state, LOGICAL);
11802
11803                     eval = reg2Lanode(pRExC_state, EVAL,
11804                                        n,
11805
11806                                        /* for later propagation into (??{})
11807                                         * return value */
11808                                        RExC_flags & RXf_PMf_COMPILETIME
11809                                       );
11810                     FLAGS(REGNODE_p(ret)) = 2;
11811                     if (! REGTAIL(pRExC_state, ret, eval)) {
11812                         REQUIRE_BRANCHJ(flagp, 0);
11813                     }
11814                     /* deal with the length of this later - MJD */
11815                     return ret;
11816                 }
11817                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11818                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11819                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11820                 return ret;
11821             }
11822             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11823             {
11824                 int is_define= 0;
11825                 const int DEFINE_len = sizeof("DEFINE") - 1;
11826                 if (    RExC_parse < RExC_end - 1
11827                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11828                             && (   RExC_parse[1] == '='
11829                                 || RExC_parse[1] == '!'
11830                                 || RExC_parse[1] == '<'
11831                                 || RExC_parse[1] == '{'))
11832                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11833                             && (   memBEGINs(RExC_parse + 1,
11834                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11835                                          "pla:")
11836                                 || memBEGINs(RExC_parse + 1,
11837                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11838                                          "plb:")
11839                                 || memBEGINs(RExC_parse + 1,
11840                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11841                                          "nla:")
11842                                 || memBEGINs(RExC_parse + 1,
11843                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11844                                          "nlb:")
11845                                 || memBEGINs(RExC_parse + 1,
11846                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11847                                          "positive_lookahead:")
11848                                 || memBEGINs(RExC_parse + 1,
11849                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11850                                          "positive_lookbehind:")
11851                                 || memBEGINs(RExC_parse + 1,
11852                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11853                                          "negative_lookahead:")
11854                                 || memBEGINs(RExC_parse + 1,
11855                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11856                                          "negative_lookbehind:"))))
11857                 ) { /* Lookahead or eval. */
11858                     I32 flag;
11859                     regnode_offset tail;
11860
11861                     ret = reg_node(pRExC_state, LOGICAL);
11862                     FLAGS(REGNODE_p(ret)) = 1;
11863
11864                     tail = reg(pRExC_state, 1, &flag, depth+1);
11865                     RETURN_FAIL_ON_RESTART(flag, flagp);
11866                     if (! REGTAIL(pRExC_state, ret, tail)) {
11867                         REQUIRE_BRANCHJ(flagp, 0);
11868                     }
11869                     goto insert_if;
11870                 }
11871                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11872                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11873                 {
11874                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11875                     char *name_start= RExC_parse++;
11876                     U32 num = 0;
11877                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11878                     if (   RExC_parse == name_start
11879                         || RExC_parse >= RExC_end
11880                         || *RExC_parse != ch)
11881                     {
11882                         vFAIL2("Sequence (?(%c... not terminated",
11883                             (ch == '>' ? '<' : ch));
11884                     }
11885                     RExC_parse++;
11886                     if (sv_dat) {
11887                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11888                         RExC_rxi->data->data[num]=(void*)sv_dat;
11889                         SvREFCNT_inc_simple_void_NN(sv_dat);
11890                     }
11891                     ret = reganode(pRExC_state, GROUPPN, num);
11892                     goto insert_if_check_paren;
11893                 }
11894                 else if (memBEGINs(RExC_parse,
11895                                    (STRLEN) (RExC_end - RExC_parse),
11896                                    "DEFINE"))
11897                 {
11898                     ret = reganode(pRExC_state, DEFINEP, 0);
11899                     RExC_parse += DEFINE_len;
11900                     is_define = 1;
11901                     goto insert_if_check_paren;
11902                 }
11903                 else if (RExC_parse[0] == 'R') {
11904                     RExC_parse++;
11905                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11906                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11907                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11908                      */
11909                     parno = 0;
11910                     if (RExC_parse[0] == '0') {
11911                         parno = 1;
11912                         RExC_parse++;
11913                     }
11914                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11915                         UV uv;
11916                         endptr = RExC_end;
11917                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11918                             && uv <= I32_MAX
11919                         ) {
11920                             parno = (I32)uv + 1;
11921                             RExC_parse = (char*)endptr;
11922                         }
11923                         /* else "Switch condition not recognized" below */
11924                     } else if (RExC_parse[0] == '&') {
11925                         SV *sv_dat;
11926                         RExC_parse++;
11927                         sv_dat = reg_scan_name(pRExC_state,
11928                                                REG_RSN_RETURN_DATA);
11929                         if (sv_dat)
11930                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11931                     }
11932                     ret = reganode(pRExC_state, INSUBP, parno);
11933                     goto insert_if_check_paren;
11934                 }
11935                 else if (inRANGE(RExC_parse[0], '1', '9')) {
11936                     /* (?(1)...) */
11937                     char c;
11938                     UV uv;
11939                     endptr = RExC_end;
11940                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11941                         && uv <= I32_MAX
11942                     ) {
11943                         parno = (I32)uv;
11944                         RExC_parse = (char*)endptr;
11945                     }
11946                     else {
11947                         vFAIL("panic: grok_atoUV returned FALSE");
11948                     }
11949                     ret = reganode(pRExC_state, GROUPP, parno);
11950
11951                  insert_if_check_paren:
11952                     if (UCHARAT(RExC_parse) != ')') {
11953                         RExC_parse += UTF
11954                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11955                                       : 1;
11956                         vFAIL("Switch condition not recognized");
11957                     }
11958                     nextchar(pRExC_state);
11959                   insert_if:
11960                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
11961                                                              IFTHEN, 0)))
11962                     {
11963                         REQUIRE_BRANCHJ(flagp, 0);
11964                     }
11965                     br = regbranch(pRExC_state, &flags, 1, depth+1);
11966                     if (br == 0) {
11967                         RETURN_FAIL_ON_RESTART(flags,flagp);
11968                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11969                               (UV) flags);
11970                     } else
11971                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
11972                                                              LONGJMP, 0)))
11973                     {
11974                         REQUIRE_BRANCHJ(flagp, 0);
11975                     }
11976                     c = UCHARAT(RExC_parse);
11977                     nextchar(pRExC_state);
11978                     if (flags&HASWIDTH)
11979                         *flagp |= HASWIDTH;
11980                     if (c == '|') {
11981                         if (is_define)
11982                             vFAIL("(?(DEFINE)....) does not allow branches");
11983
11984                         /* Fake one for optimizer.  */
11985                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11986
11987                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11988                             RETURN_FAIL_ON_RESTART(flags, flagp);
11989                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11990                                   (UV) flags);
11991                         }
11992                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
11993                             REQUIRE_BRANCHJ(flagp, 0);
11994                         }
11995                         if (flags&HASWIDTH)
11996                             *flagp |= HASWIDTH;
11997                         c = UCHARAT(RExC_parse);
11998                         nextchar(pRExC_state);
11999                     }
12000                     else
12001                         lastbr = 0;
12002                     if (c != ')') {
12003                         if (RExC_parse >= RExC_end)
12004                             vFAIL("Switch (?(condition)... not terminated");
12005                         else
12006                             vFAIL("Switch (?(condition)... contains too many branches");
12007                     }
12008                     ender = reg_node(pRExC_state, TAIL);
12009                     if (! REGTAIL(pRExC_state, br, ender)) {
12010                         REQUIRE_BRANCHJ(flagp, 0);
12011                     }
12012                     if (lastbr) {
12013                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12014                             REQUIRE_BRANCHJ(flagp, 0);
12015                         }
12016                         if (! REGTAIL(pRExC_state,
12017                                       REGNODE_OFFSET(
12018                                                  NEXTOPER(
12019                                                  NEXTOPER(REGNODE_p(lastbr)))),
12020                                       ender))
12021                         {
12022                             REQUIRE_BRANCHJ(flagp, 0);
12023                         }
12024                     }
12025                     else
12026                         if (! REGTAIL(pRExC_state, ret, ender)) {
12027                             REQUIRE_BRANCHJ(flagp, 0);
12028                         }
12029 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12030                     RExC_size++; /* XXX WHY do we need this?!!
12031                                     For large programs it seems to be required
12032                                     but I can't figure out why. -- dmq*/
12033 #endif
12034                     return ret;
12035                 }
12036                 RExC_parse += UTF
12037                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12038                               : 1;
12039                 vFAIL("Unknown switch condition (?(...))");
12040             }
12041             case '[':           /* (?[ ... ]) */
12042                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12043                                          oregcomp_parse);
12044             case 0: /* A NUL */
12045                 RExC_parse--; /* for vFAIL to print correctly */
12046                 vFAIL("Sequence (? incomplete");
12047                 break;
12048
12049             case ')':
12050                 if (RExC_strict) {  /* [perl #132851] */
12051                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12052                 }
12053                 /* FALLTHROUGH */
12054             default: /* e.g., (?i) */
12055                 RExC_parse = (char *) seqstart + 1;
12056               parse_flags:
12057                 parse_lparen_question_flags(pRExC_state);
12058                 if (UCHARAT(RExC_parse) != ':') {
12059                     if (RExC_parse < RExC_end)
12060                         nextchar(pRExC_state);
12061                     *flagp = TRYAGAIN;
12062                     return 0;
12063                 }
12064                 paren = ':';
12065                 nextchar(pRExC_state);
12066                 ret = 0;
12067                 goto parse_rest;
12068             } /* end switch */
12069         }
12070         else {
12071             if (*RExC_parse == '{') {
12072                 ckWARNregdep(RExC_parse + 1,
12073                             "Unescaped left brace in regex is "
12074                             "deprecated here (and will be fatal "
12075                             "in Perl 5.32), passed through");
12076             }
12077             /* Not bothering to indent here, as the above 'else' is temporary
12078              * */
12079         if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12080           capturing_parens:
12081             parno = RExC_npar;
12082             RExC_npar++;
12083             if (! ALL_PARENS_COUNTED) {
12084                 /* If we are in our first pass through (and maybe only pass),
12085                  * we  need to allocate memory for the capturing parentheses
12086                  * data structures.
12087                  */
12088
12089                 if (!RExC_parens_buf_size) {
12090                     /* first guess at number of parens we might encounter */
12091                     RExC_parens_buf_size = 10;
12092
12093                     /* setup RExC_open_parens, which holds the address of each
12094                      * OPEN tag, and to make things simpler for the 0 index the
12095                      * start of the program - this is used later for offsets */
12096                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12097                             regnode_offset);
12098                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12099
12100                     /* setup RExC_close_parens, which holds the address of each
12101                      * CLOSE tag, and to make things simpler for the 0 index
12102                      * the end of the program - this is used later for offsets
12103                      * */
12104                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12105                             regnode_offset);
12106                     /* we dont know where end op starts yet, so we dont need to
12107                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12108                      * above */
12109                 }
12110                 else if (RExC_npar > RExC_parens_buf_size) {
12111                     I32 old_size = RExC_parens_buf_size;
12112
12113                     RExC_parens_buf_size *= 2;
12114
12115                     Renew(RExC_open_parens, RExC_parens_buf_size,
12116                             regnode_offset);
12117                     Zero(RExC_open_parens + old_size,
12118                             RExC_parens_buf_size - old_size, regnode_offset);
12119
12120                     Renew(RExC_close_parens, RExC_parens_buf_size,
12121                             regnode_offset);
12122                     Zero(RExC_close_parens + old_size,
12123                             RExC_parens_buf_size - old_size, regnode_offset);
12124                 }
12125             }
12126
12127             ret = reganode(pRExC_state, OPEN, parno);
12128             if (!RExC_nestroot)
12129                 RExC_nestroot = parno;
12130             if (RExC_open_parens && !RExC_open_parens[parno])
12131             {
12132                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12133                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12134                     22, "|    |", (int)(depth * 2 + 1), "",
12135                     (IV)parno, ret));
12136                 RExC_open_parens[parno]= ret;
12137             }
12138
12139             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12140             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12141             is_open = 1;
12142         } else {
12143             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12144             paren = ':';
12145             ret = 0;
12146         }
12147         }
12148     }
12149     else                        /* ! paren */
12150         ret = 0;
12151
12152    parse_rest:
12153     /* Pick up the branches, linking them together. */
12154     parse_start = RExC_parse;   /* MJD */
12155     br = regbranch(pRExC_state, &flags, 1, depth+1);
12156
12157     /*     branch_len = (paren != 0); */
12158
12159     if (br == 0) {
12160         RETURN_FAIL_ON_RESTART(flags, flagp);
12161         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12162     }
12163     if (*RExC_parse == '|') {
12164         if (RExC_use_BRANCHJ) {
12165             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12166         }
12167         else {                  /* MJD */
12168             reginsert(pRExC_state, BRANCH, br, depth+1);
12169             Set_Node_Length(REGNODE_p(br), paren != 0);
12170             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12171         }
12172         have_branch = 1;
12173     }
12174     else if (paren == ':') {
12175         *flagp |= flags&SIMPLE;
12176     }
12177     if (is_open) {                              /* Starts with OPEN. */
12178         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12179             REQUIRE_BRANCHJ(flagp, 0);
12180         }
12181     }
12182     else if (paren != '?')              /* Not Conditional */
12183         ret = br;
12184     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12185     lastbr = br;
12186     while (*RExC_parse == '|') {
12187         if (RExC_use_BRANCHJ) {
12188             bool shut_gcc_up;
12189
12190             ender = reganode(pRExC_state, LONGJMP, 0);
12191
12192             /* Append to the previous. */
12193             shut_gcc_up = REGTAIL(pRExC_state,
12194                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12195                          ender);
12196             PERL_UNUSED_VAR(shut_gcc_up);
12197         }
12198         nextchar(pRExC_state);
12199         if (freeze_paren) {
12200             if (RExC_npar > after_freeze)
12201                 after_freeze = RExC_npar;
12202             RExC_npar = freeze_paren;
12203         }
12204         br = regbranch(pRExC_state, &flags, 0, depth+1);
12205
12206         if (br == 0) {
12207             RETURN_FAIL_ON_RESTART(flags, flagp);
12208             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12209         }
12210         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12211             REQUIRE_BRANCHJ(flagp, 0);
12212         }
12213         lastbr = br;
12214         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12215     }
12216
12217     if (have_branch || paren != ':') {
12218         regnode * br;
12219
12220         /* Make a closing node, and hook it on the end. */
12221         switch (paren) {
12222         case ':':
12223             ender = reg_node(pRExC_state, TAIL);
12224             break;
12225         case 1: case 2:
12226             ender = reganode(pRExC_state, CLOSE, parno);
12227             if ( RExC_close_parens ) {
12228                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12229                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12230                         22, "|    |", (int)(depth * 2 + 1), "",
12231                         (IV)parno, ender));
12232                 RExC_close_parens[parno]= ender;
12233                 if (RExC_nestroot == parno)
12234                     RExC_nestroot = 0;
12235             }
12236             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12237             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12238             break;
12239         case 's':
12240             ender = reg_node(pRExC_state, SRCLOSE);
12241             RExC_in_script_run = 0;
12242             break;
12243         case '<':
12244         case 'a':
12245         case 'A':
12246         case 'b':
12247         case 'B':
12248         case ',':
12249         case '=':
12250         case '!':
12251             *flagp &= ~HASWIDTH;
12252             /* FALLTHROUGH */
12253         case 't':   /* aTomic */
12254         case '>':
12255             ender = reg_node(pRExC_state, SUCCEED);
12256             break;
12257         case 0:
12258             ender = reg_node(pRExC_state, END);
12259             assert(!RExC_end_op); /* there can only be one! */
12260             RExC_end_op = REGNODE_p(ender);
12261             if (RExC_close_parens) {
12262                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12263                     "%*s%*s Setting close paren #0 (END) to %d\n",
12264                     22, "|    |", (int)(depth * 2 + 1), "",
12265                     ender));
12266
12267                 RExC_close_parens[0]= ender;
12268             }
12269             break;
12270         }
12271         DEBUG_PARSE_r(
12272             DEBUG_PARSE_MSG("lsbr");
12273             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12274             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12275             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12276                           SvPV_nolen_const(RExC_mysv1),
12277                           (IV)lastbr,
12278                           SvPV_nolen_const(RExC_mysv2),
12279                           (IV)ender,
12280                           (IV)(ender - lastbr)
12281             );
12282         );
12283         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12284             REQUIRE_BRANCHJ(flagp, 0);
12285         }
12286
12287         if (have_branch) {
12288             char is_nothing= 1;
12289             if (depth==1)
12290                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12291
12292             /* Hook the tails of the branches to the closing node. */
12293             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12294                 const U8 op = PL_regkind[OP(br)];
12295                 if (op == BRANCH) {
12296                     if (! REGTAIL_STUDY(pRExC_state,
12297                                         REGNODE_OFFSET(NEXTOPER(br)),
12298                                         ender))
12299                     {
12300                         REQUIRE_BRANCHJ(flagp, 0);
12301                     }
12302                     if ( OP(NEXTOPER(br)) != NOTHING
12303                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12304                         is_nothing= 0;
12305                 }
12306                 else if (op == BRANCHJ) {
12307                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12308                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12309                                         ender);
12310                     PERL_UNUSED_VAR(shut_gcc_up);
12311                     /* for now we always disable this optimisation * /
12312                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12313                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12314                     */
12315                         is_nothing= 0;
12316                 }
12317             }
12318             if (is_nothing) {
12319                 regnode * ret_as_regnode = REGNODE_p(ret);
12320                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12321                                ? regnext(ret_as_regnode)
12322                                : ret_as_regnode;
12323                 DEBUG_PARSE_r(
12324                     DEBUG_PARSE_MSG("NADA");
12325                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12326                                      NULL, pRExC_state);
12327                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12328                                      NULL, pRExC_state);
12329                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12330                                   SvPV_nolen_const(RExC_mysv1),
12331                                   (IV)REG_NODE_NUM(ret_as_regnode),
12332                                   SvPV_nolen_const(RExC_mysv2),
12333                                   (IV)ender,
12334                                   (IV)(ender - ret)
12335                     );
12336                 );
12337                 OP(br)= NOTHING;
12338                 if (OP(REGNODE_p(ender)) == TAIL) {
12339                     NEXT_OFF(br)= 0;
12340                     RExC_emit= REGNODE_OFFSET(br) + 1;
12341                 } else {
12342                     regnode *opt;
12343                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12344                         OP(opt)= OPTIMIZED;
12345                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12346                 }
12347             }
12348         }
12349     }
12350
12351     {
12352         const char *p;
12353          /* Even/odd or x=don't care: 010101x10x */
12354         static const char parens[] = "=!aA<,>Bbt";
12355          /* flag below is set to 0 up through 'A'; 1 for larger */
12356
12357         if (paren && (p = strchr(parens, paren))) {
12358             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12359             int flag = (p - parens) > 3;
12360
12361             if (paren == '>' || paren == 't') {
12362                 node = SUSPEND, flag = 0;
12363             }
12364
12365             reginsert(pRExC_state, node, ret, depth+1);
12366             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12367             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12368             FLAGS(REGNODE_p(ret)) = flag;
12369             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12370             {
12371                 REQUIRE_BRANCHJ(flagp, 0);
12372             }
12373         }
12374     }
12375
12376     /* Check for proper termination. */
12377     if (paren) {
12378         /* restore original flags, but keep (?p) and, if we've encountered
12379          * something in the parse that changes /d rules into /u, keep the /u */
12380         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12381         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12382             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12383         }
12384         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12385             RExC_parse = oregcomp_parse;
12386             vFAIL("Unmatched (");
12387         }
12388         nextchar(pRExC_state);
12389     }
12390     else if (!paren && RExC_parse < RExC_end) {
12391         if (*RExC_parse == ')') {
12392             RExC_parse++;
12393             vFAIL("Unmatched )");
12394         }
12395         else
12396             FAIL("Junk on end of regexp");      /* "Can't happen". */
12397         NOT_REACHED; /* NOTREACHED */
12398     }
12399
12400     if (RExC_in_lookbehind) {
12401         RExC_in_lookbehind--;
12402     }
12403     if (RExC_in_lookahead) {
12404         RExC_in_lookahead--;
12405     }
12406     if (after_freeze > RExC_npar)
12407         RExC_npar = after_freeze;
12408     return(ret);
12409 }
12410
12411 /*
12412  - regbranch - one alternative of an | operator
12413  *
12414  * Implements the concatenation operator.
12415  *
12416  * On success, returns the offset at which any next node should be placed into
12417  * the regex engine program being compiled.
12418  *
12419  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12420  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12421  * UTF-8
12422  */
12423 STATIC regnode_offset
12424 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12425 {
12426     regnode_offset ret;
12427     regnode_offset chain = 0;
12428     regnode_offset latest;
12429     I32 flags = 0, c = 0;
12430     GET_RE_DEBUG_FLAGS_DECL;
12431
12432     PERL_ARGS_ASSERT_REGBRANCH;
12433
12434     DEBUG_PARSE("brnc");
12435
12436     if (first)
12437         ret = 0;
12438     else {
12439         if (RExC_use_BRANCHJ)
12440             ret = reganode(pRExC_state, BRANCHJ, 0);
12441         else {
12442             ret = reg_node(pRExC_state, BRANCH);
12443             Set_Node_Length(REGNODE_p(ret), 1);
12444         }
12445     }
12446
12447     *flagp = WORST;                     /* Tentatively. */
12448
12449     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12450                             FALSE /* Don't force to /x */ );
12451     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12452         flags &= ~TRYAGAIN;
12453         latest = regpiece(pRExC_state, &flags, depth+1);
12454         if (latest == 0) {
12455             if (flags & TRYAGAIN)
12456                 continue;
12457             RETURN_FAIL_ON_RESTART(flags, flagp);
12458             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12459         }
12460         else if (ret == 0)
12461             ret = latest;
12462         *flagp |= flags&(HASWIDTH|POSTPONED);
12463         if (chain == 0)         /* First piece. */
12464             *flagp |= flags&SPSTART;
12465         else {
12466             /* FIXME adding one for every branch after the first is probably
12467              * excessive now we have TRIE support. (hv) */
12468             MARK_NAUGHTY(1);
12469             if (! REGTAIL(pRExC_state, chain, latest)) {
12470                 /* XXX We could just redo this branch, but figuring out what
12471                  * bookkeeping needs to be reset is a pain, and it's likely
12472                  * that other branches that goto END will also be too large */
12473                 REQUIRE_BRANCHJ(flagp, 0);
12474             }
12475         }
12476         chain = latest;
12477         c++;
12478     }
12479     if (chain == 0) {   /* Loop ran zero times. */
12480         chain = reg_node(pRExC_state, NOTHING);
12481         if (ret == 0)
12482             ret = chain;
12483     }
12484     if (c == 1) {
12485         *flagp |= flags&SIMPLE;
12486     }
12487
12488     return ret;
12489 }
12490
12491 /*
12492  - regpiece - something followed by possible quantifier * + ? {n,m}
12493  *
12494  * Note that the branching code sequences used for ? and the general cases
12495  * of * and + are somewhat optimized:  they use the same NOTHING node as
12496  * both the endmarker for their branch list and the body of the last branch.
12497  * It might seem that this node could be dispensed with entirely, but the
12498  * endmarker role is not redundant.
12499  *
12500  * On success, returns the offset at which any next node should be placed into
12501  * the regex engine program being compiled.
12502  *
12503  * Returns 0 otherwise, with *flagp set to indicate why:
12504  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12505  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12506  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12507  */
12508 STATIC regnode_offset
12509 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12510 {
12511     regnode_offset ret;
12512     char op;
12513     char *next;
12514     I32 flags;
12515     const char * const origparse = RExC_parse;
12516     I32 min;
12517     I32 max = REG_INFTY;
12518 #ifdef RE_TRACK_PATTERN_OFFSETS
12519     char *parse_start;
12520 #endif
12521     const char *maxpos = NULL;
12522     UV uv;
12523
12524     /* Save the original in case we change the emitted regop to a FAIL. */
12525     const regnode_offset orig_emit = RExC_emit;
12526
12527     GET_RE_DEBUG_FLAGS_DECL;
12528
12529     PERL_ARGS_ASSERT_REGPIECE;
12530
12531     DEBUG_PARSE("piec");
12532
12533     ret = regatom(pRExC_state, &flags, depth+1);
12534     if (ret == 0) {
12535         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12536         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12537     }
12538
12539     op = *RExC_parse;
12540
12541     if (op == '{' && regcurly(RExC_parse)) {
12542         maxpos = NULL;
12543 #ifdef RE_TRACK_PATTERN_OFFSETS
12544         parse_start = RExC_parse; /* MJD */
12545 #endif
12546         next = RExC_parse + 1;
12547         while (isDIGIT(*next) || *next == ',') {
12548             if (*next == ',') {
12549                 if (maxpos)
12550                     break;
12551                 else
12552                     maxpos = next;
12553             }
12554             next++;
12555         }
12556         if (*next == '}') {             /* got one */
12557             const char* endptr;
12558             if (!maxpos)
12559                 maxpos = next;
12560             RExC_parse++;
12561             if (isDIGIT(*RExC_parse)) {
12562                 endptr = RExC_end;
12563                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12564                     vFAIL("Invalid quantifier in {,}");
12565                 if (uv >= REG_INFTY)
12566                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12567                 min = (I32)uv;
12568             } else {
12569                 min = 0;
12570             }
12571             if (*maxpos == ',')
12572                 maxpos++;
12573             else
12574                 maxpos = RExC_parse;
12575             if (isDIGIT(*maxpos)) {
12576                 endptr = RExC_end;
12577                 if (!grok_atoUV(maxpos, &uv, &endptr))
12578                     vFAIL("Invalid quantifier in {,}");
12579                 if (uv >= REG_INFTY)
12580                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12581                 max = (I32)uv;
12582             } else {
12583                 max = REG_INFTY;                /* meaning "infinity" */
12584             }
12585             RExC_parse = next;
12586             nextchar(pRExC_state);
12587             if (max < min) {    /* If can't match, warn and optimize to fail
12588                                    unconditionally */
12589                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12590                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12591                 NEXT_OFF(REGNODE_p(orig_emit)) =
12592                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12593                 return ret;
12594             }
12595             else if (min == max && *RExC_parse == '?')
12596             {
12597                 ckWARN2reg(RExC_parse + 1,
12598                            "Useless use of greediness modifier '%c'",
12599                            *RExC_parse);
12600             }
12601
12602           do_curly:
12603             if ((flags&SIMPLE)) {
12604                 if (min == 0 && max == REG_INFTY) {
12605                     reginsert(pRExC_state, STAR, ret, depth+1);
12606                     MARK_NAUGHTY(4);
12607                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12608                     goto nest_check;
12609                 }
12610                 if (min == 1 && max == REG_INFTY) {
12611                     reginsert(pRExC_state, PLUS, ret, depth+1);
12612                     MARK_NAUGHTY(3);
12613                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12614                     goto nest_check;
12615                 }
12616                 MARK_NAUGHTY_EXP(2, 2);
12617                 reginsert(pRExC_state, CURLY, ret, depth+1);
12618                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12619                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12620             }
12621             else {
12622                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12623
12624                 FLAGS(REGNODE_p(w)) = 0;
12625                 if (!  REGTAIL(pRExC_state, ret, w)) {
12626                     REQUIRE_BRANCHJ(flagp, 0);
12627                 }
12628                 if (RExC_use_BRANCHJ) {
12629                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12630                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12631                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12632                 }
12633                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12634                                 /* MJD hk */
12635                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12636                 Set_Node_Length(REGNODE_p(ret),
12637                                 op == '{' ? (RExC_parse - parse_start) : 1);
12638
12639                 if (RExC_use_BRANCHJ)
12640                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12641                                                        LONGJMP. */
12642                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12643                                                           NOTHING)))
12644                 {
12645                     REQUIRE_BRANCHJ(flagp, 0);
12646                 }
12647                 RExC_whilem_seen++;
12648                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12649             }
12650             FLAGS(REGNODE_p(ret)) = 0;
12651
12652             if (min > 0)
12653                 *flagp = WORST;
12654             if (max > 0)
12655                 *flagp |= HASWIDTH;
12656             ARG1_SET(REGNODE_p(ret), (U16)min);
12657             ARG2_SET(REGNODE_p(ret), (U16)max);
12658             if (max == REG_INFTY)
12659                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12660
12661             goto nest_check;
12662         }
12663     }
12664
12665     if (!ISMULT1(op)) {
12666         *flagp = flags;
12667         return(ret);
12668     }
12669
12670 #if 0                           /* Now runtime fix should be reliable. */
12671
12672     /* if this is reinstated, don't forget to put this back into perldiag:
12673
12674             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12675
12676            (F) The part of the regexp subject to either the * or + quantifier
12677            could match an empty string. The {#} shows in the regular
12678            expression about where the problem was discovered.
12679
12680     */
12681
12682     if (!(flags&HASWIDTH) && op != '?')
12683       vFAIL("Regexp *+ operand could be empty");
12684 #endif
12685
12686 #ifdef RE_TRACK_PATTERN_OFFSETS
12687     parse_start = RExC_parse;
12688 #endif
12689     nextchar(pRExC_state);
12690
12691     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12692
12693     if (op == '*') {
12694         min = 0;
12695         goto do_curly;
12696     }
12697     else if (op == '+') {
12698         min = 1;
12699         goto do_curly;
12700     }
12701     else if (op == '?') {
12702         min = 0; max = 1;
12703         goto do_curly;
12704     }
12705   nest_check:
12706     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12707         ckWARN2reg(RExC_parse,
12708                    "%" UTF8f " matches null string many times",
12709                    UTF8fARG(UTF, (RExC_parse >= origparse
12710                                  ? RExC_parse - origparse
12711                                  : 0),
12712                    origparse));
12713     }
12714
12715     if (*RExC_parse == '?') {
12716         nextchar(pRExC_state);
12717         reginsert(pRExC_state, MINMOD, ret, depth+1);
12718         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12719             REQUIRE_BRANCHJ(flagp, 0);
12720         }
12721     }
12722     else if (*RExC_parse == '+') {
12723         regnode_offset ender;
12724         nextchar(pRExC_state);
12725         ender = reg_node(pRExC_state, SUCCEED);
12726         if (! REGTAIL(pRExC_state, ret, ender)) {
12727             REQUIRE_BRANCHJ(flagp, 0);
12728         }
12729         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12730         ender = reg_node(pRExC_state, TAIL);
12731         if (! REGTAIL(pRExC_state, ret, ender)) {
12732             REQUIRE_BRANCHJ(flagp, 0);
12733         }
12734     }
12735
12736     if (ISMULT2(RExC_parse)) {
12737         RExC_parse++;
12738         vFAIL("Nested quantifiers");
12739     }
12740
12741     return(ret);
12742 }
12743
12744 STATIC bool
12745 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12746                 regnode_offset * node_p,
12747                 UV * code_point_p,
12748                 int * cp_count,
12749                 I32 * flagp,
12750                 const bool strict,
12751                 const U32 depth
12752     )
12753 {
12754  /* This routine teases apart the various meanings of \N and returns
12755   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12756   * in the current context.
12757   *
12758   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12759   *
12760   * If <code_point_p> is not NULL, the context is expecting the result to be a
12761   * single code point.  If this \N instance turns out to a single code point,
12762   * the function returns TRUE and sets *code_point_p to that code point.
12763   *
12764   * If <node_p> is not NULL, the context is expecting the result to be one of
12765   * the things representable by a regnode.  If this \N instance turns out to be
12766   * one such, the function generates the regnode, returns TRUE and sets *node_p
12767   * to point to the offset of that regnode into the regex engine program being
12768   * compiled.
12769   *
12770   * If this instance of \N isn't legal in any context, this function will
12771   * generate a fatal error and not return.
12772   *
12773   * On input, RExC_parse should point to the first char following the \N at the
12774   * time of the call.  On successful return, RExC_parse will have been updated
12775   * to point to just after the sequence identified by this routine.  Also
12776   * *flagp has been updated as needed.
12777   *
12778   * When there is some problem with the current context and this \N instance,
12779   * the function returns FALSE, without advancing RExC_parse, nor setting
12780   * *node_p, nor *code_point_p, nor *flagp.
12781   *
12782   * If <cp_count> is not NULL, the caller wants to know the length (in code
12783   * points) that this \N sequence matches.  This is set, and the input is
12784   * parsed for errors, even if the function returns FALSE, as detailed below.
12785   *
12786   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12787   *
12788   * Probably the most common case is for the \N to specify a single code point.
12789   * *cp_count will be set to 1, and *code_point_p will be set to that code
12790   * point.
12791   *
12792   * Another possibility is for the input to be an empty \N{}.  This is no
12793   * longer accepted, and will generate a fatal error.
12794   *
12795   * Another possibility is for a custom charnames handler to be in effect which
12796   * translates the input name to an empty string.  *cp_count will be set to 0.
12797   * *node_p will be set to a generated NOTHING node.
12798   *
12799   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12800   * set to 0. *node_p will be set to a generated REG_ANY node.
12801   *
12802   * The fifth possibility is that \N resolves to a sequence of more than one
12803   * code points.  *cp_count will be set to the number of code points in the
12804   * sequence. *node_p will be set to a generated node returned by this
12805   * function calling S_reg().
12806   *
12807   * The final possibility is that it is premature to be calling this function;
12808   * the parse needs to be restarted.  This can happen when this changes from
12809   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12810   * latter occurs only when the fifth possibility would otherwise be in
12811   * effect, and is because one of those code points requires the pattern to be
12812   * recompiled as UTF-8.  The function returns FALSE, and sets the
12813   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12814   * happens, the caller needs to desist from continuing parsing, and return
12815   * this information to its caller.  This is not set for when there is only one
12816   * code point, as this can be called as part of an ANYOF node, and they can
12817   * store above-Latin1 code points without the pattern having to be in UTF-8.
12818   *
12819   * For non-single-quoted regexes, the tokenizer has resolved character and
12820   * sequence names inside \N{...} into their Unicode values, normalizing the
12821   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12822   * hex-represented code points in the sequence.  This is done there because
12823   * the names can vary based on what charnames pragma is in scope at the time,
12824   * so we need a way to take a snapshot of what they resolve to at the time of
12825   * the original parse. [perl #56444].
12826   *
12827   * That parsing is skipped for single-quoted regexes, so here we may get
12828   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12829   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12830   * the native character set for non-ASCII platforms.  The other possibilities
12831   * are already native, so no translation is done. */
12832
12833     char * endbrace;    /* points to '}' following the name */
12834     char* p = RExC_parse; /* Temporary */
12835
12836     SV * substitute_parse = NULL;
12837     char *orig_end;
12838     char *save_start;
12839     I32 flags;
12840
12841     GET_RE_DEBUG_FLAGS_DECL;
12842
12843     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12844
12845     GET_RE_DEBUG_FLAGS;
12846
12847     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12848     assert(! (node_p && cp_count));               /* At most 1 should be set */
12849
12850     if (cp_count) {     /* Initialize return for the most common case */
12851         *cp_count = 1;
12852     }
12853
12854     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12855      * modifier.  The other meanings do not, so use a temporary until we find
12856      * out which we are being called with */
12857     skip_to_be_ignored_text(pRExC_state, &p,
12858                             FALSE /* Don't force to /x */ );
12859
12860     /* Disambiguate between \N meaning a named character versus \N meaning
12861      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12862      * quantifier, or if there is no '{' at all */
12863     if (*p != '{' || regcurly(p)) {
12864         RExC_parse = p;
12865         if (cp_count) {
12866             *cp_count = -1;
12867         }
12868
12869         if (! node_p) {
12870             return FALSE;
12871         }
12872
12873         *node_p = reg_node(pRExC_state, REG_ANY);
12874         *flagp |= HASWIDTH|SIMPLE;
12875         MARK_NAUGHTY(1);
12876         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12877         return TRUE;
12878     }
12879
12880     /* The test above made sure that the next real character is a '{', but
12881      * under the /x modifier, it could be separated by space (or a comment and
12882      * \n) and this is not allowed (for consistency with \x{...} and the
12883      * tokenizer handling of \N{NAME}). */
12884     if (*RExC_parse != '{') {
12885         vFAIL("Missing braces on \\N{}");
12886     }
12887
12888     RExC_parse++;       /* Skip past the '{' */
12889
12890     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12891     if (! endbrace) { /* no trailing brace */
12892         vFAIL2("Missing right brace on \\%c{}", 'N');
12893     }
12894
12895     /* Here, we have decided it should be a named character or sequence.  These
12896      * imply Unicode semantics */
12897     REQUIRE_UNI_RULES(flagp, FALSE);
12898
12899     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12900      * nothing at all (not allowed under strict) */
12901     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12902         RExC_parse = endbrace;
12903         if (strict) {
12904             RExC_parse++;   /* Position after the "}" */
12905             vFAIL("Zero length \\N{}");
12906         }
12907
12908         if (cp_count) {
12909             *cp_count = 0;
12910         }
12911         nextchar(pRExC_state);
12912         if (! node_p) {
12913             return FALSE;
12914         }
12915
12916         *node_p = reg_node(pRExC_state, NOTHING);
12917         return TRUE;
12918     }
12919
12920     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12921
12922         /* Here, the name isn't of the form  U+....  This can happen if the
12923          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
12924          * is the time to find out what the name means */
12925
12926         const STRLEN name_len = endbrace - RExC_parse;
12927         SV *  value_sv;     /* What does this name evaluate to */
12928         SV ** value_svp;
12929         const U8 * value;   /* string of name's value */
12930         STRLEN value_len;   /* and its length */
12931
12932         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
12933          *  toke.c, and their values. Make sure is initialized */
12934         if (! RExC_unlexed_names) {
12935             RExC_unlexed_names = newHV();
12936         }
12937
12938         /* If we have already seen this name in this pattern, use that.  This
12939          * allows us to only call the charnames handler once per name per
12940          * pattern.  A broken or malicious handler could return something
12941          * different each time, which could cause the results to vary depending
12942          * on if something gets added or subtracted from the pattern that
12943          * causes the number of passes to change, for example */
12944         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12945                                                       name_len, 0)))
12946         {
12947             value_sv = *value_svp;
12948         }
12949         else { /* Otherwise we have to go out and get the name */
12950             const char * error_msg = NULL;
12951             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12952                                                       UTF,
12953                                                       &error_msg);
12954             if (error_msg) {
12955                 RExC_parse = endbrace;
12956                 vFAIL(error_msg);
12957             }
12958
12959             /* If no error message, should have gotten a valid return */
12960             assert (value_sv);
12961
12962             /* Save the name's meaning for later use */
12963             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12964                            value_sv, 0))
12965             {
12966                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12967             }
12968         }
12969
12970         /* Here, we have the value the name evaluates to in 'value_sv' */
12971         value = (U8 *) SvPV(value_sv, value_len);
12972
12973         /* See if the result is one code point vs 0 or multiple */
12974         if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
12975                                                ? UTF8SKIP(value)
12976                                                : 1))
12977         {
12978             /* Here, exactly one code point.  If that isn't what is wanted,
12979              * fail */
12980             if (! code_point_p) {
12981                 RExC_parse = p;
12982                 return FALSE;
12983             }
12984
12985             /* Convert from string to numeric code point */
12986             *code_point_p = (SvUTF8(value_sv))
12987                             ? valid_utf8_to_uvchr(value, NULL)
12988                             : *value;
12989
12990             /* Have parsed this entire single code point \N{...}.  *cp_count
12991              * has already been set to 1, so don't do it again. */
12992             RExC_parse = endbrace;
12993             nextchar(pRExC_state);
12994             return TRUE;
12995         } /* End of is a single code point */
12996
12997         /* Count the code points, if caller desires.  The API says to do this
12998          * even if we will later return FALSE */
12999         if (cp_count) {
13000             *cp_count = 0;
13001
13002             *cp_count = (SvUTF8(value_sv))
13003                         ? utf8_length(value, value + value_len)
13004                         : value_len;
13005         }
13006
13007         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13008          * But don't back the pointer up if the caller wants to know how many
13009          * code points there are (they need to handle it themselves in this
13010          * case).  */
13011         if (! node_p) {
13012             if (! cp_count) {
13013                 RExC_parse = p;
13014             }
13015             return FALSE;
13016         }
13017
13018         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13019          * reg recursively to parse it.  That way, it retains its atomicness,
13020          * while not having to worry about any special handling that some code
13021          * points may have. */
13022
13023         substitute_parse = newSVpvs("?:");
13024         sv_catsv(substitute_parse, value_sv);
13025         sv_catpv(substitute_parse, ")");
13026
13027         /* The value should already be native, so no need to convert on EBCDIC
13028          * platforms.*/
13029         assert(! RExC_recode_x_to_native);
13030
13031     }
13032     else {   /* \N{U+...} */
13033         Size_t count = 0;   /* code point count kept internally */
13034
13035         /* We can get to here when the input is \N{U+...} or when toke.c has
13036          * converted a name to the \N{U+...} form.  This include changing a
13037          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13038
13039         RExC_parse += 2;    /* Skip past the 'U+' */
13040
13041         /* Code points are separated by dots.  The '}' terminates the whole
13042          * thing. */
13043
13044         do {    /* Loop until the ending brace */
13045             UV cp = 0;
13046             char * start_digit;     /* The first of the current code point */
13047             if (! isXDIGIT(*RExC_parse)) {
13048                 RExC_parse++;
13049                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13050             }
13051
13052             start_digit = RExC_parse;
13053             count++;
13054
13055             /* Loop through the hex digits of the current code point */
13056             do {
13057                 /* Adding this digit will shift the result 4 bits.  If that
13058                  * result would be above the legal max, it's overflow */
13059                 if (cp > MAX_LEGAL_CP >> 4) {
13060
13061                     /* Find the end of the code point */
13062                     do {
13063                         RExC_parse ++;
13064                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
13065
13066                     /* Be sure to synchronize this message with the similar one
13067                      * in utf8.c */
13068                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
13069                         " permissible max is 0x%" UVxf,
13070                         (int) (RExC_parse - start_digit), start_digit,
13071                         MAX_LEGAL_CP);
13072                 }
13073
13074                 /* Accumulate this (valid) digit into the running total */
13075                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
13076
13077                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
13078                  * underscore separator */
13079                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
13080                     RExC_parse++;
13081                 }
13082             } while (isXDIGIT(*RExC_parse));
13083
13084             /* Here, have accumulated the next code point */
13085             if (RExC_parse >= endbrace) {   /* If done ... */
13086                 if (count != 1) {
13087                     goto do_concat;
13088                 }
13089
13090                 /* Here, is a single code point; fail if doesn't want that */
13091                 if (! code_point_p) {
13092                     RExC_parse = p;
13093                     return FALSE;
13094                 }
13095
13096                 /* A single code point is easy to handle; just return it */
13097                 *code_point_p = UNI_TO_NATIVE(cp);
13098                 RExC_parse = endbrace;
13099                 nextchar(pRExC_state);
13100                 return TRUE;
13101             }
13102
13103             /* Here, the only legal thing would be a multiple character
13104              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
13105              * character must be a dot (and the one after that can't be the
13106              * endbrace, or we'd have something like \N{U+100.} ) */
13107             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13108                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13109                                 ? UTF8SKIP(RExC_parse)
13110                                 : 1;
13111                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
13112                     RExC_parse = endbrace;
13113                 }
13114                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13115             }
13116
13117             /* Here, looks like its really a multiple character sequence.  Fail
13118              * if that's not what the caller wants.  But continue with counting
13119              * and error checking if they still want a count */
13120             if (! node_p && ! cp_count) {
13121                 return FALSE;
13122             }
13123
13124             /* What is done here is to convert this to a sub-pattern of the
13125              * form \x{char1}\x{char2}...  and then call reg recursively to
13126              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13127              * atomicness, while not having to worry about special handling
13128              * that some code points may have.  We don't create a subpattern,
13129              * but go through the motions of code point counting and error
13130              * checking, if the caller doesn't want a node returned. */
13131
13132             if (node_p && count == 1) {
13133                 substitute_parse = newSVpvs("?:");
13134             }
13135
13136           do_concat:
13137
13138             if (node_p) {
13139                 /* Convert to notation the rest of the code understands */
13140                 sv_catpvs(substitute_parse, "\\x{");
13141                 sv_catpvn(substitute_parse, start_digit,
13142                                             RExC_parse - start_digit);
13143                 sv_catpvs(substitute_parse, "}");
13144             }
13145
13146             /* Move to after the dot (or ending brace the final time through.)
13147              * */
13148             RExC_parse++;
13149             count++;
13150
13151         } while (RExC_parse < endbrace);
13152
13153         if (! node_p) { /* Doesn't want the node */
13154             assert (cp_count);
13155
13156             *cp_count = count;
13157             return FALSE;
13158         }
13159
13160         sv_catpvs(substitute_parse, ")");
13161
13162         /* The values are Unicode, and therefore have to be converted to native
13163          * on a non-Unicode (meaning non-ASCII) platform. */
13164         SET_recode_x_to_native(1);
13165     }
13166
13167     /* Here, we have the string the name evaluates to, ready to be parsed,
13168      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13169      * constructs.  This can be called from within a substitute parse already.
13170      * The error reporting mechanism doesn't work for 2 levels of this, but the
13171      * code above has validated this new construct, so there should be no
13172      * errors generated by the below.  And this isn' an exact copy, so the
13173      * mechanism to seamlessly deal with this won't work, so turn off warnings
13174      * during it */
13175     save_start = RExC_start;
13176     orig_end = RExC_end;
13177
13178     RExC_parse = RExC_start = SvPVX(substitute_parse);
13179     RExC_end = RExC_parse + SvCUR(substitute_parse);
13180     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13181
13182     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13183
13184     /* Restore the saved values */
13185     RESTORE_WARNINGS;
13186     RExC_start = save_start;
13187     RExC_parse = endbrace;
13188     RExC_end = orig_end;
13189     SET_recode_x_to_native(0);
13190
13191     SvREFCNT_dec_NN(substitute_parse);
13192
13193     if (! *node_p) {
13194         RETURN_FAIL_ON_RESTART(flags, flagp);
13195         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13196             (UV) flags);
13197     }
13198     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13199
13200     nextchar(pRExC_state);
13201
13202     return TRUE;
13203 }
13204
13205
13206 PERL_STATIC_INLINE U8
13207 S_compute_EXACTish(RExC_state_t *pRExC_state)
13208 {
13209     U8 op;
13210
13211     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13212
13213     if (! FOLD) {
13214         return (LOC)
13215                 ? EXACTL
13216                 : EXACT;
13217     }
13218
13219     op = get_regex_charset(RExC_flags);
13220     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13221         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13222                  been, so there is no hole */
13223     }
13224
13225     return op + EXACTF;
13226 }
13227
13228 STATIC bool
13229 S_new_regcurly(const char *s, const char *e)
13230 {
13231     /* This is a temporary function designed to match the most lenient form of
13232      * a {m,n} quantifier we ever envision, with either number omitted, and
13233      * spaces anywhere between/before/after them.
13234      *
13235      * If this function fails, then the string it matches is very unlikely to
13236      * ever be considered a valid quantifier, so we can allow the '{' that
13237      * begins it to be considered as a literal */
13238
13239     bool has_min = FALSE;
13240     bool has_max = FALSE;
13241
13242     PERL_ARGS_ASSERT_NEW_REGCURLY;
13243
13244     if (s >= e || *s++ != '{')
13245         return FALSE;
13246
13247     while (s < e && isSPACE(*s)) {
13248         s++;
13249     }
13250     while (s < e && isDIGIT(*s)) {
13251         has_min = TRUE;
13252         s++;
13253     }
13254     while (s < e && isSPACE(*s)) {
13255         s++;
13256     }
13257
13258     if (*s == ',') {
13259         s++;
13260         while (s < e && isSPACE(*s)) {
13261             s++;
13262         }
13263         while (s < e && isDIGIT(*s)) {
13264             has_max = TRUE;
13265             s++;
13266         }
13267         while (s < e && isSPACE(*s)) {
13268             s++;
13269         }
13270     }
13271
13272     return s < e && *s == '}' && (has_min || has_max);
13273 }
13274
13275 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13276  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13277
13278 static I32
13279 S_backref_value(char *p, char *e)
13280 {
13281     const char* endptr = e;
13282     UV val;
13283     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13284         return (I32)val;
13285     return I32_MAX;
13286 }
13287
13288
13289 /*
13290  - regatom - the lowest level
13291
13292    Try to identify anything special at the start of the current parse position.
13293    If there is, then handle it as required. This may involve generating a
13294    single regop, such as for an assertion; or it may involve recursing, such as
13295    to handle a () structure.
13296
13297    If the string doesn't start with something special then we gobble up
13298    as much literal text as we can.  If we encounter a quantifier, we have to
13299    back off the final literal character, as that quantifier applies to just it
13300    and not to the whole string of literals.
13301
13302    Once we have been able to handle whatever type of thing started the
13303    sequence, we return the offset into the regex engine program being compiled
13304    at which any  next regnode should be placed.
13305
13306    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13307    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13308    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13309    Otherwise does not return 0.
13310
13311    Note: we have to be careful with escapes, as they can be both literal
13312    and special, and in the case of \10 and friends, context determines which.
13313
13314    A summary of the code structure is:
13315
13316    switch (first_byte) {
13317         cases for each special:
13318             handle this special;
13319             break;
13320         case '\\':
13321             switch (2nd byte) {
13322                 cases for each unambiguous special:
13323                     handle this special;
13324                     break;
13325                 cases for each ambigous special/literal:
13326                     disambiguate;
13327                     if (special)  handle here
13328                     else goto defchar;
13329                 default: // unambiguously literal:
13330                     goto defchar;
13331             }
13332         default:  // is a literal char
13333             // FALL THROUGH
13334         defchar:
13335             create EXACTish node for literal;
13336             while (more input and node isn't full) {
13337                 switch (input_byte) {
13338                    cases for each special;
13339                        make sure parse pointer is set so that the next call to
13340                            regatom will see this special first
13341                        goto loopdone; // EXACTish node terminated by prev. char
13342                    default:
13343                        append char to EXACTISH node;
13344                 }
13345                 get next input byte;
13346             }
13347         loopdone:
13348    }
13349    return the generated node;
13350
13351    Specifically there are two separate switches for handling
13352    escape sequences, with the one for handling literal escapes requiring
13353    a dummy entry for all of the special escapes that are actually handled
13354    by the other.
13355
13356 */
13357
13358 STATIC regnode_offset
13359 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13360 {
13361     dVAR;
13362     regnode_offset ret = 0;
13363     I32 flags = 0;
13364     char *parse_start;
13365     U8 op;
13366     int invert = 0;
13367     U8 arg;
13368
13369     GET_RE_DEBUG_FLAGS_DECL;
13370
13371     *flagp = WORST;             /* Tentatively. */
13372
13373     DEBUG_PARSE("atom");
13374
13375     PERL_ARGS_ASSERT_REGATOM;
13376
13377   tryagain:
13378     parse_start = RExC_parse;
13379     assert(RExC_parse < RExC_end);
13380     switch ((U8)*RExC_parse) {
13381     case '^':
13382         RExC_seen_zerolen++;
13383         nextchar(pRExC_state);
13384         if (RExC_flags & RXf_PMf_MULTILINE)
13385             ret = reg_node(pRExC_state, MBOL);
13386         else
13387             ret = reg_node(pRExC_state, SBOL);
13388         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13389         break;
13390     case '$':
13391         nextchar(pRExC_state);
13392         if (*RExC_parse)
13393             RExC_seen_zerolen++;
13394         if (RExC_flags & RXf_PMf_MULTILINE)
13395             ret = reg_node(pRExC_state, MEOL);
13396         else
13397             ret = reg_node(pRExC_state, SEOL);
13398         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13399         break;
13400     case '.':
13401         nextchar(pRExC_state);
13402         if (RExC_flags & RXf_PMf_SINGLELINE)
13403             ret = reg_node(pRExC_state, SANY);
13404         else
13405             ret = reg_node(pRExC_state, REG_ANY);
13406         *flagp |= HASWIDTH|SIMPLE;
13407         MARK_NAUGHTY(1);
13408         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13409         break;
13410     case '[':
13411     {
13412         char * const oregcomp_parse = ++RExC_parse;
13413         ret = regclass(pRExC_state, flagp, depth+1,
13414                        FALSE, /* means parse the whole char class */
13415                        TRUE, /* allow multi-char folds */
13416                        FALSE, /* don't silence non-portable warnings. */
13417                        (bool) RExC_strict,
13418                        TRUE, /* Allow an optimized regnode result */
13419                        NULL);
13420         if (ret == 0) {
13421             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13422             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13423                   (UV) *flagp);
13424         }
13425         if (*RExC_parse != ']') {
13426             RExC_parse = oregcomp_parse;
13427             vFAIL("Unmatched [");
13428         }
13429         nextchar(pRExC_state);
13430         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13431         break;
13432     }
13433     case '(':
13434         nextchar(pRExC_state);
13435         ret = reg(pRExC_state, 2, &flags, depth+1);
13436         if (ret == 0) {
13437                 if (flags & TRYAGAIN) {
13438                     if (RExC_parse >= RExC_end) {
13439                          /* Make parent create an empty node if needed. */
13440                         *flagp |= TRYAGAIN;
13441                         return(0);
13442                     }
13443                     goto tryagain;
13444                 }
13445                 RETURN_FAIL_ON_RESTART(flags, flagp);
13446                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13447                                                                  (UV) flags);
13448         }
13449         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13450         break;
13451     case '|':
13452     case ')':
13453         if (flags & TRYAGAIN) {
13454             *flagp |= TRYAGAIN;
13455             return 0;
13456         }
13457         vFAIL("Internal urp");
13458                                 /* Supposed to be caught earlier. */
13459         break;
13460     case '?':
13461     case '+':
13462     case '*':
13463         RExC_parse++;
13464         vFAIL("Quantifier follows nothing");
13465         break;
13466     case '\\':
13467         /* Special Escapes
13468
13469            This switch handles escape sequences that resolve to some kind
13470            of special regop and not to literal text. Escape sequences that
13471            resolve to literal text are handled below in the switch marked
13472            "Literal Escapes".
13473
13474            Every entry in this switch *must* have a corresponding entry
13475            in the literal escape switch. However, the opposite is not
13476            required, as the default for this switch is to jump to the
13477            literal text handling code.
13478         */
13479         RExC_parse++;
13480         switch ((U8)*RExC_parse) {
13481         /* Special Escapes */
13482         case 'A':
13483             RExC_seen_zerolen++;
13484             ret = reg_node(pRExC_state, SBOL);
13485             /* SBOL is shared with /^/ so we set the flags so we can tell
13486              * /\A/ from /^/ in split. */
13487             FLAGS(REGNODE_p(ret)) = 1;
13488             *flagp |= SIMPLE;
13489             goto finish_meta_pat;
13490         case 'G':
13491             ret = reg_node(pRExC_state, GPOS);
13492             RExC_seen |= REG_GPOS_SEEN;
13493             *flagp |= SIMPLE;
13494             goto finish_meta_pat;
13495         case 'K':
13496             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13497                 RExC_seen_zerolen++;
13498                 ret = reg_node(pRExC_state, KEEPS);
13499                 *flagp |= SIMPLE;
13500                 /* XXX:dmq : disabling in-place substitution seems to
13501                  * be necessary here to avoid cases of memory corruption, as
13502                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13503                  */
13504                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13505                 goto finish_meta_pat;
13506             }
13507             else {
13508                 ++RExC_parse; /* advance past the 'K' */
13509                 vFAIL("\\K not permitted in lookahead/lookbehind");
13510             }
13511         case 'Z':
13512             ret = reg_node(pRExC_state, SEOL);
13513             *flagp |= SIMPLE;
13514             RExC_seen_zerolen++;                /* Do not optimize RE away */
13515             goto finish_meta_pat;
13516         case 'z':
13517             ret = reg_node(pRExC_state, EOS);
13518             *flagp |= SIMPLE;
13519             RExC_seen_zerolen++;                /* Do not optimize RE away */
13520             goto finish_meta_pat;
13521         case 'C':
13522             vFAIL("\\C no longer supported");
13523         case 'X':
13524             ret = reg_node(pRExC_state, CLUMP);
13525             *flagp |= HASWIDTH;
13526             goto finish_meta_pat;
13527
13528         case 'W':
13529             invert = 1;
13530             /* FALLTHROUGH */
13531         case 'w':
13532             arg = ANYOF_WORDCHAR;
13533             goto join_posix;
13534
13535         case 'B':
13536             invert = 1;
13537             /* FALLTHROUGH */
13538         case 'b':
13539           {
13540             U8 flags = 0;
13541             regex_charset charset = get_regex_charset(RExC_flags);
13542
13543             RExC_seen_zerolen++;
13544             RExC_seen |= REG_LOOKBEHIND_SEEN;
13545             op = BOUND + charset;
13546
13547             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13548                 flags = TRADITIONAL_BOUND;
13549                 if (op > BOUNDA) {  /* /aa is same as /a */
13550                     op = BOUNDA;
13551                 }
13552             }
13553             else {
13554                 STRLEN length;
13555                 char name = *RExC_parse;
13556                 char * endbrace = NULL;
13557                 RExC_parse += 2;
13558                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13559
13560                 if (! endbrace) {
13561                     vFAIL2("Missing right brace on \\%c{}", name);
13562                 }
13563                 /* XXX Need to decide whether to take spaces or not.  Should be
13564                  * consistent with \p{}, but that currently is SPACE, which
13565                  * means vertical too, which seems wrong
13566                  * while (isBLANK(*RExC_parse)) {
13567                     RExC_parse++;
13568                 }*/
13569                 if (endbrace == RExC_parse) {
13570                     RExC_parse++;  /* After the '}' */
13571                     vFAIL2("Empty \\%c{}", name);
13572                 }
13573                 length = endbrace - RExC_parse;
13574                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13575                     length--;
13576                 }*/
13577                 switch (*RExC_parse) {
13578                     case 'g':
13579                         if (    length != 1
13580                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13581                         {
13582                             goto bad_bound_type;
13583                         }
13584                         flags = GCB_BOUND;
13585                         break;
13586                     case 'l':
13587                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13588                             goto bad_bound_type;
13589                         }
13590                         flags = LB_BOUND;
13591                         break;
13592                     case 's':
13593                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13594                             goto bad_bound_type;
13595                         }
13596                         flags = SB_BOUND;
13597                         break;
13598                     case 'w':
13599                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13600                             goto bad_bound_type;
13601                         }
13602                         flags = WB_BOUND;
13603                         break;
13604                     default:
13605                       bad_bound_type:
13606                         RExC_parse = endbrace;
13607                         vFAIL2utf8f(
13608                             "'%" UTF8f "' is an unknown bound type",
13609                             UTF8fARG(UTF, length, endbrace - length));
13610                         NOT_REACHED; /*NOTREACHED*/
13611                 }
13612                 RExC_parse = endbrace;
13613                 REQUIRE_UNI_RULES(flagp, 0);
13614
13615                 if (op == BOUND) {
13616                     op = BOUNDU;
13617                 }
13618                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13619                     op = BOUNDU;
13620                     length += 4;
13621
13622                     /* Don't have to worry about UTF-8, in this message because
13623                      * to get here the contents of the \b must be ASCII */
13624                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13625                               "Using /u for '%.*s' instead of /%s",
13626                               (unsigned) length,
13627                               endbrace - length + 1,
13628                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13629                               ? ASCII_RESTRICT_PAT_MODS
13630                               : ASCII_MORE_RESTRICT_PAT_MODS);
13631                 }
13632             }
13633
13634             if (op == BOUND) {
13635                 RExC_seen_d_op = TRUE;
13636             }
13637             else if (op == BOUNDL) {
13638                 RExC_contains_locale = 1;
13639             }
13640
13641             if (invert) {
13642                 op += NBOUND - BOUND;
13643             }
13644
13645             ret = reg_node(pRExC_state, op);
13646             FLAGS(REGNODE_p(ret)) = flags;
13647
13648             *flagp |= SIMPLE;
13649
13650             goto finish_meta_pat;
13651           }
13652
13653         case 'D':
13654             invert = 1;
13655             /* FALLTHROUGH */
13656         case 'd':
13657             arg = ANYOF_DIGIT;
13658             if (! DEPENDS_SEMANTICS) {
13659                 goto join_posix;
13660             }
13661
13662             /* \d doesn't have any matches in the upper Latin1 range, hence /d
13663              * is equivalent to /u.  Changing to /u saves some branches at
13664              * runtime */
13665             op = POSIXU;
13666             goto join_posix_op_known;
13667
13668         case 'R':
13669             ret = reg_node(pRExC_state, LNBREAK);
13670             *flagp |= HASWIDTH|SIMPLE;
13671             goto finish_meta_pat;
13672
13673         case 'H':
13674             invert = 1;
13675             /* FALLTHROUGH */
13676         case 'h':
13677             arg = ANYOF_BLANK;
13678             op = POSIXU;
13679             goto join_posix_op_known;
13680
13681         case 'V':
13682             invert = 1;
13683             /* FALLTHROUGH */
13684         case 'v':
13685             arg = ANYOF_VERTWS;
13686             op = POSIXU;
13687             goto join_posix_op_known;
13688
13689         case 'S':
13690             invert = 1;
13691             /* FALLTHROUGH */
13692         case 's':
13693             arg = ANYOF_SPACE;
13694
13695           join_posix:
13696
13697             op = POSIXD + get_regex_charset(RExC_flags);
13698             if (op > POSIXA) {  /* /aa is same as /a */
13699                 op = POSIXA;
13700             }
13701             else if (op == POSIXL) {
13702                 RExC_contains_locale = 1;
13703             }
13704             else if (op == POSIXD) {
13705                 RExC_seen_d_op = TRUE;
13706             }
13707
13708           join_posix_op_known:
13709
13710             if (invert) {
13711                 op += NPOSIXD - POSIXD;
13712             }
13713
13714             ret = reg_node(pRExC_state, op);
13715             FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13716
13717             *flagp |= HASWIDTH|SIMPLE;
13718             /* FALLTHROUGH */
13719
13720           finish_meta_pat:
13721             if (   UCHARAT(RExC_parse + 1) == '{'
13722                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13723             {
13724                 RExC_parse += 2;
13725                 vFAIL("Unescaped left brace in regex is illegal here");
13726             }
13727             nextchar(pRExC_state);
13728             Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13729             break;
13730         case 'p':
13731         case 'P':
13732             RExC_parse--;
13733
13734             ret = regclass(pRExC_state, flagp, depth+1,
13735                            TRUE, /* means just parse this element */
13736                            FALSE, /* don't allow multi-char folds */
13737                            FALSE, /* don't silence non-portable warnings.  It
13738                                      would be a bug if these returned
13739                                      non-portables */
13740                            (bool) RExC_strict,
13741                            TRUE, /* Allow an optimized regnode result */
13742                            NULL);
13743             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13744             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13745              * multi-char folds are allowed.  */
13746             if (!ret)
13747                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13748                       (UV) *flagp);
13749
13750             RExC_parse--;
13751
13752             Set_Node_Offset(REGNODE_p(ret), parse_start);
13753             Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13754             nextchar(pRExC_state);
13755             break;
13756         case 'N':
13757             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13758              * \N{...} evaluates to a sequence of more than one code points).
13759              * The function call below returns a regnode, which is our result.
13760              * The parameters cause it to fail if the \N{} evaluates to a
13761              * single code point; we handle those like any other literal.  The
13762              * reason that the multicharacter case is handled here and not as
13763              * part of the EXACtish code is because of quantifiers.  In
13764              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13765              * this way makes that Just Happen. dmq.
13766              * join_exact() will join this up with adjacent EXACTish nodes
13767              * later on, if appropriate. */
13768             ++RExC_parse;
13769             if (grok_bslash_N(pRExC_state,
13770                               &ret,     /* Want a regnode returned */
13771                               NULL,     /* Fail if evaluates to a single code
13772                                            point */
13773                               NULL,     /* Don't need a count of how many code
13774                                            points */
13775                               flagp,
13776                               RExC_strict,
13777                               depth)
13778             ) {
13779                 break;
13780             }
13781
13782             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13783
13784             /* Here, evaluates to a single code point.  Go get that */
13785             RExC_parse = parse_start;
13786             goto defchar;
13787
13788         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13789       parse_named_seq:
13790         {
13791             char ch;
13792             if (   RExC_parse >= RExC_end - 1
13793                 || ((   ch = RExC_parse[1]) != '<'
13794                                       && ch != '\''
13795                                       && ch != '{'))
13796             {
13797                 RExC_parse++;
13798                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13799                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13800             } else {
13801                 RExC_parse += 2;
13802                 ret = handle_named_backref(pRExC_state,
13803                                            flagp,
13804                                            parse_start,
13805                                            (ch == '<')
13806                                            ? '>'
13807                                            : (ch == '{')
13808                                              ? '}'
13809                                              : '\'');
13810             }
13811             break;
13812         }
13813         case 'g':
13814         case '1': case '2': case '3': case '4':
13815         case '5': case '6': case '7': case '8': case '9':
13816             {
13817                 I32 num;
13818                 bool hasbrace = 0;
13819
13820                 if (*RExC_parse == 'g') {
13821                     bool isrel = 0;
13822
13823                     RExC_parse++;
13824                     if (*RExC_parse == '{') {
13825                         RExC_parse++;
13826                         hasbrace = 1;
13827                     }
13828                     if (*RExC_parse == '-') {
13829                         RExC_parse++;
13830                         isrel = 1;
13831                     }
13832                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13833                         if (isrel) RExC_parse--;
13834                         RExC_parse -= 2;
13835                         goto parse_named_seq;
13836                     }
13837
13838                     if (RExC_parse >= RExC_end) {
13839                         goto unterminated_g;
13840                     }
13841                     num = S_backref_value(RExC_parse, RExC_end);
13842                     if (num == 0)
13843                         vFAIL("Reference to invalid group 0");
13844                     else if (num == I32_MAX) {
13845                          if (isDIGIT(*RExC_parse))
13846                             vFAIL("Reference to nonexistent group");
13847                         else
13848                           unterminated_g:
13849                             vFAIL("Unterminated \\g... pattern");
13850                     }
13851
13852                     if (isrel) {
13853                         num = RExC_npar - num;
13854                         if (num < 1)
13855                             vFAIL("Reference to nonexistent or unclosed group");
13856                     }
13857                 }
13858                 else {
13859                     num = S_backref_value(RExC_parse, RExC_end);
13860                     /* bare \NNN might be backref or octal - if it is larger
13861                      * than or equal RExC_npar then it is assumed to be an
13862                      * octal escape. Note RExC_npar is +1 from the actual
13863                      * number of parens. */
13864                     /* Note we do NOT check if num == I32_MAX here, as that is
13865                      * handled by the RExC_npar check */
13866
13867                     if (
13868                         /* any numeric escape < 10 is always a backref */
13869                         num > 9
13870                         /* any numeric escape < RExC_npar is a backref */
13871                         && num >= RExC_npar
13872                         /* cannot be an octal escape if it starts with 8 */
13873                         && *RExC_parse != '8'
13874                         /* cannot be an octal escape if it starts with 9 */
13875                         && *RExC_parse != '9'
13876                     ) {
13877                         /* Probably not meant to be a backref, instead likely
13878                          * to be an octal character escape, e.g. \35 or \777.
13879                          * The above logic should make it obvious why using
13880                          * octal escapes in patterns is problematic. - Yves */
13881                         RExC_parse = parse_start;
13882                         goto defchar;
13883                     }
13884                 }
13885
13886                 /* At this point RExC_parse points at a numeric escape like
13887                  * \12 or \88 or something similar, which we should NOT treat
13888                  * as an octal escape. It may or may not be a valid backref
13889                  * escape. For instance \88888888 is unlikely to be a valid
13890                  * backref. */
13891                 while (isDIGIT(*RExC_parse))
13892                     RExC_parse++;
13893                 if (hasbrace) {
13894                     if (*RExC_parse != '}')
13895                         vFAIL("Unterminated \\g{...} pattern");
13896                     RExC_parse++;
13897                 }
13898                 if (num >= (I32)RExC_npar) {
13899
13900                     /* It might be a forward reference; we can't fail until we
13901                      * know, by completing the parse to get all the groups, and
13902                      * then reparsing */
13903                     if (ALL_PARENS_COUNTED)  {
13904                         if (num >= RExC_total_parens)  {
13905                             vFAIL("Reference to nonexistent group");
13906                         }
13907                     }
13908                     else {
13909                         REQUIRE_PARENS_PASS;
13910                     }
13911                 }
13912                 RExC_sawback = 1;
13913                 ret = reganode(pRExC_state,
13914                                ((! FOLD)
13915                                  ? REF
13916                                  : (ASCII_FOLD_RESTRICTED)
13917                                    ? REFFA
13918                                    : (AT_LEAST_UNI_SEMANTICS)
13919                                      ? REFFU
13920                                      : (LOC)
13921                                        ? REFFL
13922                                        : REFF),
13923                                 num);
13924                 if (OP(REGNODE_p(ret)) == REFF) {
13925                     RExC_seen_d_op = TRUE;
13926                 }
13927                 *flagp |= HASWIDTH;
13928
13929                 /* override incorrect value set in reganode MJD */
13930                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13931                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13932                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13933                                         FALSE /* Don't force to /x */ );
13934             }
13935             break;
13936         case '\0':
13937             if (RExC_parse >= RExC_end)
13938                 FAIL("Trailing \\");
13939             /* FALLTHROUGH */
13940         default:
13941             /* Do not generate "unrecognized" warnings here, we fall
13942                back into the quick-grab loop below */
13943             RExC_parse = parse_start;
13944             goto defchar;
13945         } /* end of switch on a \foo sequence */
13946         break;
13947
13948     case '#':
13949
13950         /* '#' comments should have been spaced over before this function was
13951          * called */
13952         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13953         /*
13954         if (RExC_flags & RXf_PMf_EXTENDED) {
13955             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13956             if (RExC_parse < RExC_end)
13957                 goto tryagain;
13958         }
13959         */
13960
13961         /* FALLTHROUGH */
13962
13963     default:
13964           defchar: {
13965
13966             /* Here, we have determined that the next thing is probably a
13967              * literal character.  RExC_parse points to the first byte of its
13968              * definition.  (It still may be an escape sequence that evaluates
13969              * to a single character) */
13970
13971             STRLEN len = 0;
13972             UV ender = 0;
13973             char *p;
13974             char *s;
13975
13976 /* This allows us to fill a node with just enough spare so that if the final
13977  * character folds, its expansion is guaranteed to fit */
13978 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13979
13980             char *s0;
13981             U8 upper_parse = MAX_NODE_STRING_SIZE;
13982
13983             /* We start out as an EXACT node, even if under /i, until we find a
13984              * character which is in a fold.  The algorithm now segregates into
13985              * separate nodes, characters that fold from those that don't under
13986              * /i.  (This hopefully will create nodes that are fixed strings
13987              * even under /i, giving the optimizer something to grab on to.)
13988              * So, if a node has something in it and the next character is in
13989              * the opposite category, that node is closed up, and the function
13990              * returns.  Then regatom is called again, and a new node is
13991              * created for the new category. */
13992             U8 node_type = EXACT;
13993
13994             /* Assume the node will be fully used; the excess is given back at
13995              * the end.  We can't make any other length assumptions, as a byte
13996              * input sequence could shrink down. */
13997             Ptrdiff_t initial_size = STR_SZ(256);
13998
13999             bool next_is_quantifier;
14000             char * oldp = NULL;
14001
14002             /* We can convert EXACTF nodes to EXACTFU if they contain only
14003              * characters that match identically regardless of the target
14004              * string's UTF8ness.  The reason to do this is that EXACTF is not
14005              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14006              * runtime.
14007              *
14008              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14009              * contain only above-Latin1 characters (hence must be in UTF8),
14010              * which don't participate in folds with Latin1-range characters,
14011              * as the latter's folds aren't known until runtime. */
14012             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14013
14014             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14015              * allows us to override this as encountered */
14016             U8 maybe_SIMPLE = SIMPLE;
14017
14018             /* Does this node contain something that can't match unless the
14019              * target string is (also) in UTF-8 */
14020             bool requires_utf8_target = FALSE;
14021
14022             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14023             bool has_ss = FALSE;
14024
14025             /* So is the MICRO SIGN */
14026             bool has_micro_sign = FALSE;
14027
14028             /* Allocate an EXACT node.  The node_type may change below to
14029              * another EXACTish node, but since the size of the node doesn't
14030              * change, it works */
14031             ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
14032             FILL_NODE(ret, node_type);
14033             RExC_emit++;
14034
14035             s = STRING(REGNODE_p(ret));
14036
14037             s0 = s;
14038
14039           reparse:
14040
14041             /* This breaks under rare circumstances.  If folding, we do not
14042              * want to split a node at a character that is a non-final in a
14043              * multi-char fold, as an input string could just happen to want to
14044              * match across the node boundary.  The code at the end of the loop
14045              * looks for this, and backs off until it finds not such a
14046              * character, but it is possible (though extremely, extremely
14047              * unlikely) for all characters in the node to be non-final fold
14048              * ones, in which case we just leave the node fully filled, and
14049              * hope that it doesn't match the string in just the wrong place */
14050
14051             assert( ! UTF     /* Is at the beginning of a character */
14052                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14053                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14054
14055             /* Here, we have a literal character.  Find the maximal string of
14056              * them in the input that we can fit into a single EXACTish node.
14057              * We quit at the first non-literal or when the node gets full, or
14058              * under /i the categorization of folding/non-folding character
14059              * changes */
14060             for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
14061
14062                 /* In most cases each iteration adds one byte to the output.
14063                  * The exceptions override this */
14064                 Size_t added_len = 1;
14065
14066                 oldp = p;
14067
14068                 /* White space has already been ignored */
14069                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14070                        || ! is_PATWS_safe((p), RExC_end, UTF));
14071
14072                 switch ((U8)*p) {
14073                 case '^':
14074                 case '$':
14075                 case '.':
14076                 case '[':
14077                 case '(':
14078                 case ')':
14079                 case '|':
14080                     goto loopdone;
14081                 case '\\':
14082                     /* Literal Escapes Switch
14083
14084                        This switch is meant to handle escape sequences that
14085                        resolve to a literal character.
14086
14087                        Every escape sequence that represents something
14088                        else, like an assertion or a char class, is handled
14089                        in the switch marked 'Special Escapes' above in this
14090                        routine, but also has an entry here as anything that
14091                        isn't explicitly mentioned here will be treated as
14092                        an unescaped equivalent literal.
14093                     */
14094
14095                     switch ((U8)*++p) {
14096
14097                     /* These are all the special escapes. */
14098                     case 'A':             /* Start assertion */
14099                     case 'b': case 'B':   /* Word-boundary assertion*/
14100                     case 'C':             /* Single char !DANGEROUS! */
14101                     case 'd': case 'D':   /* digit class */
14102                     case 'g': case 'G':   /* generic-backref, pos assertion */
14103                     case 'h': case 'H':   /* HORIZWS */
14104                     case 'k': case 'K':   /* named backref, keep marker */
14105                     case 'p': case 'P':   /* Unicode property */
14106                               case 'R':   /* LNBREAK */
14107                     case 's': case 'S':   /* space class */
14108                     case 'v': case 'V':   /* VERTWS */
14109                     case 'w': case 'W':   /* word class */
14110                     case 'X':             /* eXtended Unicode "combining
14111                                              character sequence" */
14112                     case 'z': case 'Z':   /* End of line/string assertion */
14113                         --p;
14114                         goto loopdone;
14115
14116                     /* Anything after here is an escape that resolves to a
14117                        literal. (Except digits, which may or may not)
14118                      */
14119                     case 'n':
14120                         ender = '\n';
14121                         p++;
14122                         break;
14123                     case 'N': /* Handle a single-code point named character. */
14124                         RExC_parse = p + 1;
14125                         if (! grok_bslash_N(pRExC_state,
14126                                             NULL,   /* Fail if evaluates to
14127                                                        anything other than a
14128                                                        single code point */
14129                                             &ender, /* The returned single code
14130                                                        point */
14131                                             NULL,   /* Don't need a count of
14132                                                        how many code points */
14133                                             flagp,
14134                                             RExC_strict,
14135                                             depth)
14136                         ) {
14137                             if (*flagp & NEED_UTF8)
14138                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14139                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14140
14141                             /* Here, it wasn't a single code point.  Go close
14142                              * up this EXACTish node.  The switch() prior to
14143                              * this switch handles the other cases */
14144                             RExC_parse = p = oldp;
14145                             goto loopdone;
14146                         }
14147                         p = RExC_parse;
14148                         RExC_parse = parse_start;
14149
14150                         /* The \N{} means the pattern, if previously /d,
14151                          * becomes /u.  That means it can't be an EXACTF node,
14152                          * but an EXACTFU */
14153                         if (node_type == EXACTF) {
14154                             node_type = EXACTFU;
14155
14156                             /* If the node already contains something that
14157                              * differs between EXACTF and EXACTFU, reparse it
14158                              * as EXACTFU */
14159                             if (! maybe_exactfu) {
14160                                 len = 0;
14161                                 s = s0;
14162                                 goto reparse;
14163                             }
14164                         }
14165
14166                         break;
14167                     case 'r':
14168                         ender = '\r';
14169                         p++;
14170                         break;
14171                     case 't':
14172                         ender = '\t';
14173                         p++;
14174                         break;
14175                     case 'f':
14176                         ender = '\f';
14177                         p++;
14178                         break;
14179                     case 'e':
14180                         ender = ESC_NATIVE;
14181                         p++;
14182                         break;
14183                     case 'a':
14184                         ender = '\a';
14185                         p++;
14186                         break;
14187                     case 'o':
14188                         {
14189                             UV result;
14190                             const char* error_msg;
14191
14192                             bool valid = grok_bslash_o(&p,
14193                                                        RExC_end,
14194                                                        &result,
14195                                                        &error_msg,
14196                                                        TO_OUTPUT_WARNINGS(p),
14197                                                        (bool) RExC_strict,
14198                                                        TRUE, /* Output warnings
14199                                                                 for non-
14200                                                                 portables */
14201                                                        UTF);
14202                             if (! valid) {
14203                                 RExC_parse = p; /* going to die anyway; point
14204                                                    to exact spot of failure */
14205                                 vFAIL(error_msg);
14206                             }
14207                             UPDATE_WARNINGS_LOC(p - 1);
14208                             ender = result;
14209                             break;
14210                         }
14211                     case 'x':
14212                         {
14213                             UV result = UV_MAX; /* initialize to erroneous
14214                                                    value */
14215                             const char* error_msg;
14216
14217                             bool valid = grok_bslash_x(&p,
14218                                                        RExC_end,
14219                                                        &result,
14220                                                        &error_msg,
14221                                                        TO_OUTPUT_WARNINGS(p),
14222                                                        (bool) RExC_strict,
14223                                                        TRUE, /* Silence warnings
14224                                                                 for non-
14225                                                                 portables */
14226                                                        UTF);
14227                             if (! valid) {
14228                                 RExC_parse = p; /* going to die anyway; point
14229                                                    to exact spot of failure */
14230                                 vFAIL(error_msg);
14231                             }
14232                             UPDATE_WARNINGS_LOC(p - 1);
14233                             ender = result;
14234
14235 #ifdef EBCDIC
14236                             if (ender < 0x100) {
14237                                 if (RExC_recode_x_to_native) {
14238                                     ender = LATIN1_TO_NATIVE(ender);
14239                                 }
14240                             }
14241 #endif
14242                             break;
14243                         }
14244                     case 'c':
14245                         p++;
14246                         ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14247                         UPDATE_WARNINGS_LOC(p);
14248                         p++;
14249                         break;
14250                     case '8': case '9': /* must be a backreference */
14251                         --p;
14252                         /* we have an escape like \8 which cannot be an octal escape
14253                          * so we exit the loop, and let the outer loop handle this
14254                          * escape which may or may not be a legitimate backref. */
14255                         goto loopdone;
14256                     case '1': case '2': case '3':case '4':
14257                     case '5': case '6': case '7':
14258                         /* When we parse backslash escapes there is ambiguity
14259                          * between backreferences and octal escapes. Any escape
14260                          * from \1 - \9 is a backreference, any multi-digit
14261                          * escape which does not start with 0 and which when
14262                          * evaluated as decimal could refer to an already
14263                          * parsed capture buffer is a back reference. Anything
14264                          * else is octal.
14265                          *
14266                          * Note this implies that \118 could be interpreted as
14267                          * 118 OR as "\11" . "8" depending on whether there
14268                          * were 118 capture buffers defined already in the
14269                          * pattern.  */
14270
14271                         /* NOTE, RExC_npar is 1 more than the actual number of
14272                          * parens we have seen so far, hence the "<" as opposed
14273                          * to "<=" */
14274                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14275                         {  /* Not to be treated as an octal constant, go
14276                                    find backref */
14277                             --p;
14278                             goto loopdone;
14279                         }
14280                         /* FALLTHROUGH */
14281                     case '0':
14282                         {
14283                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14284                             STRLEN numlen = 3;
14285                             ender = grok_oct(p, &numlen, &flags, NULL);
14286                             p += numlen;
14287                             if (   isDIGIT(*p)  /* like \08, \178 */
14288                                 && ckWARN(WARN_REGEXP)
14289                                 && numlen < 3)
14290                             {
14291                                 reg_warn_non_literal_string(
14292                                          p + 1,
14293                                          form_short_octal_warning(p, numlen));
14294                             }
14295                         }
14296                         break;
14297                     case '\0':
14298                         if (p >= RExC_end)
14299                             FAIL("Trailing \\");
14300                         /* FALLTHROUGH */
14301                     default:
14302                         if (isALPHANUMERIC(*p)) {
14303                             /* An alpha followed by '{' is going to fail next
14304                              * iteration, so don't output this warning in that
14305                              * case */
14306                             if (! isALPHA(*p) || *(p + 1) != '{') {
14307                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14308                                                   " passed through", p);
14309                             }
14310                         }
14311                         goto normal_default;
14312                     } /* End of switch on '\' */
14313                     break;
14314                 case '{':
14315                     /* Trying to gain new uses for '{' without breaking too
14316                      * much existing code is hard.  The solution currently
14317                      * adopted is:
14318                      *  1)  If there is no ambiguity that a '{' should always
14319                      *      be taken literally, at the start of a construct, we
14320                      *      just do so.
14321                      *  2)  If the literal '{' conflicts with our desired use
14322                      *      of it as a metacharacter, we die.  The deprecation
14323                      *      cycles for this have come and gone.
14324                      *  3)  If there is ambiguity, we raise a simple warning.
14325                      *      This could happen, for example, if the user
14326                      *      intended it to introduce a quantifier, but slightly
14327                      *      misspelled the quantifier.  Without this warning,
14328                      *      the quantifier would silently be taken as a literal
14329                      *      string of characters instead of a meta construct */
14330                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14331                         if (      RExC_strict
14332                             || (  p > parse_start + 1
14333                                 && isALPHA_A(*(p - 1))
14334                                 && *(p - 2) == '\\')
14335                             || new_regcurly(p, RExC_end))
14336                         {
14337                             RExC_parse = p + 1;
14338                             vFAIL("Unescaped left brace in regex is "
14339                                   "illegal here");
14340                         }
14341                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14342                                          " passed through");
14343                     }
14344                     goto normal_default;
14345                 case '}':
14346                 case ']':
14347                     if (p > RExC_parse && RExC_strict) {
14348                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14349                     }
14350                     /*FALLTHROUGH*/
14351                 default:    /* A literal character */
14352                   normal_default:
14353                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14354                         STRLEN numlen;
14355                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14356                                                &numlen, UTF8_ALLOW_DEFAULT);
14357                         p += numlen;
14358                     }
14359                     else
14360                         ender = (U8) *p++;
14361                     break;
14362                 } /* End of switch on the literal */
14363
14364                 /* Here, have looked at the literal character, and <ender>
14365                  * contains its ordinal; <p> points to the character after it.
14366                  * */
14367
14368                 if (ender > 255) {
14369                     REQUIRE_UTF8(flagp);
14370                 }
14371
14372                 /* We need to check if the next non-ignored thing is a
14373                  * quantifier.  Move <p> to after anything that should be
14374                  * ignored, which, as a side effect, positions <p> for the next
14375                  * loop iteration */
14376                 skip_to_be_ignored_text(pRExC_state, &p,
14377                                         FALSE /* Don't force to /x */ );
14378
14379                 /* If the next thing is a quantifier, it applies to this
14380                  * character only, which means that this character has to be in
14381                  * its own node and can't just be appended to the string in an
14382                  * existing node, so if there are already other characters in
14383                  * the node, close the node with just them, and set up to do
14384                  * this character again next time through, when it will be the
14385                  * only thing in its new node */
14386
14387                 next_is_quantifier =    LIKELY(p < RExC_end)
14388                                      && UNLIKELY(ISMULT2(p));
14389
14390                 if (next_is_quantifier && LIKELY(len)) {
14391                     p = oldp;
14392                     goto loopdone;
14393                 }
14394
14395                 /* Ready to add 'ender' to the node */
14396
14397                 if (! FOLD) {  /* The simple case, just append the literal */
14398
14399                       not_fold_common:
14400                         if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14401                             *(s++) = (char) ender;
14402                         }
14403                         else {
14404                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14405                             added_len = (char *) new_s - s;
14406                             s = (char *) new_s;
14407
14408                             if (ender > 255)  {
14409                                 requires_utf8_target = TRUE;
14410                             }
14411                         }
14412                 }
14413                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14414
14415                     /* Here are folding under /l, and the code point is
14416                      * problematic.  If this is the first character in the
14417                      * node, change the node type to folding.   Otherwise, if
14418                      * this is the first problematic character, close up the
14419                      * existing node, so can start a new node with this one */
14420                     if (! len) {
14421                         node_type = EXACTFL;
14422                         RExC_contains_locale = 1;
14423                     }
14424                     else if (node_type == EXACT) {
14425                         p = oldp;
14426                         goto loopdone;
14427                     }
14428
14429                     /* This problematic code point means we can't simplify
14430                      * things */
14431                     maybe_exactfu = FALSE;
14432
14433                     /* Here, we are adding a problematic fold character.
14434                      * "Problematic" in this context means that its fold isn't
14435                      * known until runtime.  (The non-problematic code points
14436                      * are the above-Latin1 ones that fold to also all
14437                      * above-Latin1.  Their folds don't vary no matter what the
14438                      * locale is.) But here we have characters whose fold
14439                      * depends on the locale.  We just add in the unfolded
14440                      * character, and wait until runtime to fold it */
14441                     goto not_fold_common;
14442                 }
14443                 else /* regular fold; see if actually is in a fold */
14444                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14445                          || (ender > 255
14446                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14447                 {
14448                     /* Here, folding, but the character isn't in a fold.
14449                      *
14450                      * Start a new node if previous characters in the node were
14451                      * folded */
14452                     if (len && node_type != EXACT) {
14453                         p = oldp;
14454                         goto loopdone;
14455                     }
14456
14457                     /* Here, continuing a node with non-folded characters.  Add
14458                      * this one */
14459                     goto not_fold_common;
14460                 }
14461                 else {  /* Here, does participate in some fold */
14462
14463                     /* If this is the first character in the node, change its
14464                      * type to folding.  Otherwise, if this is the first
14465                      * folding character in the node, close up the existing
14466                      * node, so can start a new node with this one.  */
14467                     if (! len) {
14468                         node_type = compute_EXACTish(pRExC_state);
14469                     }
14470                     else if (node_type == EXACT) {
14471                         p = oldp;
14472                         goto loopdone;
14473                     }
14474
14475                     if (UTF) {  /* Use the folded value */
14476                         if (UVCHR_IS_INVARIANT(ender)) {
14477                             *(s)++ = (U8) toFOLD(ender);
14478                         }
14479                         else {
14480                             ender = _to_uni_fold_flags(
14481                                     ender,
14482                                     (U8 *) s,
14483                                     &added_len,
14484                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14485                                                     ? FOLD_FLAGS_NOMIX_ASCII
14486                                                     : 0));
14487                             s += added_len;
14488
14489                             if (   ender > 255
14490                                 && LIKELY(ender != GREEK_SMALL_LETTER_MU))
14491                             {
14492                                 /* U+B5 folds to the MU, so its possible for a
14493                                  * non-UTF-8 target to match it */
14494                                 requires_utf8_target = TRUE;
14495                             }
14496                         }
14497                     }
14498                     else {
14499
14500                         /* Here is non-UTF8.  First, see if the character's
14501                          * fold differs between /d and /u. */
14502                         if (PL_fold[ender] != PL_fold_latin1[ender]) {
14503                             maybe_exactfu = FALSE;
14504                         }
14505
14506 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14507    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14508                                       || UNICODE_DOT_DOT_VERSION > 0)
14509
14510                         /* On non-ancient Unicode versions, this includes the
14511                          * multi-char fold SHARP S to 'ss' */
14512
14513                         if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
14514                                  || (   isALPHA_FOLD_EQ(ender, 's')
14515                                      && len > 0
14516                                      && isALPHA_FOLD_EQ(*(s-1), 's')))
14517                         {
14518                             /* Here, we have one of the following:
14519                              *  a)  a SHARP S.  This folds to 'ss' only under
14520                              *      /u rules.  If we are in that situation,
14521                              *      fold the SHARP S to 'ss'.  See the comments
14522                              *      for join_exact() as to why we fold this
14523                              *      non-UTF at compile time, and no others.
14524                              *  b)  'ss'.  When under /u, there's nothing
14525                              *      special needed to be done here.  The
14526                              *      previous iteration handled the first 's',
14527                              *      and this iteration will handle the second.
14528                              *      If, on the otherhand it's not /u, we have
14529                              *      to exclude the possibility of moving to /u,
14530                              *      so that we won't generate an unwanted
14531                              *      match, unless, at runtime, the target
14532                              *      string is in UTF-8.
14533                              * */
14534
14535                             has_ss = TRUE;
14536                             maybe_exactfu = FALSE;  /* Can't generate an
14537                                                        EXACTFU node (unless we
14538                                                        already are in one) */
14539                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14540                                 maybe_SIMPLE = 0;
14541                                 if (node_type == EXACTFU) {
14542                                     *(s++) = 's';
14543
14544                                     /* Let the code below add in the extra 's' */
14545                                     ender = 's';
14546                                     added_len = 2;
14547                                 }
14548                             }
14549                         }
14550 #endif
14551
14552                         else if (UNLIKELY(ender == MICRO_SIGN)) {
14553                             has_micro_sign = TRUE;
14554                         }
14555
14556                         *(s++) = (DEPENDS_SEMANTICS)
14557                                  ? (char) toFOLD(ender)
14558
14559                                    /* Under /u, the fold of any character in
14560                                     * the 0-255 range happens to be its
14561                                     * lowercase equivalent, except for LATIN
14562                                     * SMALL LETTER SHARP S, which was handled
14563                                     * above, and the MICRO SIGN, whose fold
14564                                     * requires UTF-8 to represent.  */
14565                                  : (char) toLOWER_L1(ender);
14566                     }
14567                 } /* End of adding current character to the node */
14568
14569                 len += added_len;
14570
14571                 if (next_is_quantifier) {
14572
14573                     /* Here, the next input is a quantifier, and to get here,
14574                      * the current character is the only one in the node. */
14575                     goto loopdone;
14576                 }
14577
14578             } /* End of loop through literal characters */
14579
14580             /* Here we have either exhausted the input or ran out of room in
14581              * the node.  (If we encountered a character that can't be in the
14582              * node, transfer is made directly to <loopdone>, and so we
14583              * wouldn't have fallen off the end of the loop.)  In the latter
14584              * case, we artificially have to split the node into two, because
14585              * we just don't have enough space to hold everything.  This
14586              * creates a problem if the final character participates in a
14587              * multi-character fold in the non-final position, as a match that
14588              * should have occurred won't, due to the way nodes are matched,
14589              * and our artificial boundary.  So back off until we find a non-
14590              * problematic character -- one that isn't at the beginning or
14591              * middle of such a fold.  (Either it doesn't participate in any
14592              * folds, or appears only in the final position of all the folds it
14593              * does participate in.)  A better solution with far fewer false
14594              * positives, and that would fill the nodes more completely, would
14595              * be to actually have available all the multi-character folds to
14596              * test against, and to back-off only far enough to be sure that
14597              * this node isn't ending with a partial one.  <upper_parse> is set
14598              * further below (if we need to reparse the node) to include just
14599              * up through that final non-problematic character that this code
14600              * identifies, so when it is set to less than the full node, we can
14601              * skip the rest of this */
14602             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14603                 PERL_UINT_FAST8_T backup_count = 0;
14604
14605                 const STRLEN full_len = len;
14606
14607                 assert(len >= MAX_NODE_STRING_SIZE);
14608
14609                 /* Here, <s> points to just beyond where we have output the
14610                  * final character of the node.  Look backwards through the
14611                  * string until find a non- problematic character */
14612
14613                 if (! UTF) {
14614
14615                     /* This has no multi-char folds to non-UTF characters */
14616                     if (ASCII_FOLD_RESTRICTED) {
14617                         goto loopdone;
14618                     }
14619
14620                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
14621                         backup_count++;
14622                     }
14623                     len = s - s0 + 1;
14624                 }
14625                 else {
14626
14627                     /* Point to the first byte of the final character */
14628                     s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
14629
14630                     while (s >= s0) {   /* Search backwards until find
14631                                            a non-problematic char */
14632                         if (UTF8_IS_INVARIANT(*s)) {
14633
14634                             /* There are no ascii characters that participate
14635                              * in multi-char folds under /aa.  In EBCDIC, the
14636                              * non-ascii invariants are all control characters,
14637                              * so don't ever participate in any folds. */
14638                             if (ASCII_FOLD_RESTRICTED
14639                                 || ! IS_NON_FINAL_FOLD(*s))
14640                             {
14641                                 break;
14642                             }
14643                         }
14644                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14645                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14646                                                                   *s, *(s+1))))
14647                             {
14648                                 break;
14649                             }
14650                         }
14651                         else if (! _invlist_contains_cp(
14652                                         PL_NonFinalFold,
14653                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
14654                         {
14655                             break;
14656                         }
14657
14658                         /* Here, the current character is problematic in that
14659                          * it does occur in the non-final position of some
14660                          * fold, so try the character before it, but have to
14661                          * special case the very first byte in the string, so
14662                          * we don't read outside the string */
14663                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14664                         backup_count++;
14665                     } /* End of loop backwards through the string */
14666
14667                     /* If there were only problematic characters in the string,
14668                      * <s> will point to before s0, in which case the length
14669                      * should be 0, otherwise include the length of the
14670                      * non-problematic character just found */
14671                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14672                 }
14673
14674                 /* Here, have found the final character, if any, that is
14675                  * non-problematic as far as ending the node without splitting
14676                  * it across a potential multi-char fold.  <len> contains the
14677                  * number of bytes in the node up-to and including that
14678                  * character, or is 0 if there is no such character, meaning
14679                  * the whole node contains only problematic characters.  In
14680                  * this case, give up and just take the node as-is.  We can't
14681                  * do any better */
14682                 if (len == 0) {
14683                     len = full_len;
14684
14685                 } else {
14686
14687                     /* Here, the node does contain some characters that aren't
14688                      * problematic.  If we didn't have to backup any, then the
14689                      * final character in the node is non-problematic, and we
14690                      * can take the node as-is */
14691                     if (backup_count == 0) {
14692                         goto loopdone;
14693                     }
14694                     else if (backup_count == 1) {
14695
14696                         /* If the final character is problematic, but the
14697                          * penultimate is not, back-off that last character to
14698                          * later start a new node with it */
14699                         p = oldp;
14700                         goto loopdone;
14701                     }
14702
14703                     /* Here, the final non-problematic character is earlier
14704                      * in the input than the penultimate character.  What we do
14705                      * is reparse from the beginning, going up only as far as
14706                      * this final ok one, thus guaranteeing that the node ends
14707                      * in an acceptable character.  The reason we reparse is
14708                      * that we know how far in the character is, but we don't
14709                      * know how to correlate its position with the input parse.
14710                      * An alternate implementation would be to build that
14711                      * correlation as we go along during the original parse,
14712                      * but that would entail extra work for every node, whereas
14713                      * this code gets executed only when the string is too
14714                      * large for the node, and the final two characters are
14715                      * problematic, an infrequent occurrence.  Yet another
14716                      * possible strategy would be to save the tail of the
14717                      * string, and the next time regatom is called, initialize
14718                      * with that.  The problem with this is that unless you
14719                      * back off one more character, you won't be guaranteed
14720                      * regatom will get called again, unless regbranch,
14721                      * regpiece ... are also changed.  If you do back off that
14722                      * extra character, so that there is input guaranteed to
14723                      * force calling regatom, you can't handle the case where
14724                      * just the first character in the node is acceptable.  I
14725                      * (khw) decided to try this method which doesn't have that
14726                      * pitfall; if performance issues are found, we can do a
14727                      * combination of the current approach plus that one */
14728                     upper_parse = len;
14729                     len = 0;
14730                     s = s0;
14731                     goto reparse;
14732                 }
14733             }   /* End of verifying node ends with an appropriate char */
14734
14735           loopdone:   /* Jumped to when encounters something that shouldn't be
14736                          in the node */
14737
14738             /* Free up any over-allocated space; cast is to silence bogus
14739              * warning in MS VC */
14740             change_engine_size(pRExC_state,
14741                                 - (Ptrdiff_t) (initial_size - STR_SZ(len)));
14742
14743             /* I (khw) don't know if you can get here with zero length, but the
14744              * old code handled this situation by creating a zero-length EXACT
14745              * node.  Might as well be NOTHING instead */
14746             if (len == 0) {
14747                 OP(REGNODE_p(ret)) = NOTHING;
14748             }
14749             else {
14750
14751                 /* If the node type is EXACT here, check to see if it
14752                  * should be EXACTL, or EXACT_ONLY8. */
14753                 if (node_type == EXACT) {
14754                     if (LOC) {
14755                         node_type = EXACTL;
14756                     }
14757                     else if (requires_utf8_target) {
14758                         node_type = EXACT_ONLY8;
14759                     }
14760                 } else if (FOLD) {
14761                     if (    UNLIKELY(has_micro_sign || has_ss)
14762                         && (node_type == EXACTFU || (   node_type == EXACTF
14763                                                      && maybe_exactfu)))
14764                     {   /* These two conditions are problematic in non-UTF-8
14765                            EXACTFU nodes. */
14766                         assert(! UTF);
14767                         node_type = EXACTFUP;
14768                     }
14769                     else if (node_type == EXACTFL) {
14770
14771                         /* 'maybe_exactfu' is deliberately set above to
14772                          * indicate this node type, where all code points in it
14773                          * are above 255 */
14774                         if (maybe_exactfu) {
14775                             node_type = EXACTFLU8;
14776                         }
14777                     }
14778                     else if (node_type == EXACTF) {  /* Means is /di */
14779
14780                         /* If 'maybe_exactfu' is clear, then we need to stay
14781                          * /di.  If it is set, it means there are no code
14782                          * points that match differently depending on UTF8ness
14783                          * of the target string, so it can become an EXACTFU
14784                          * node */
14785                         if (! maybe_exactfu) {
14786                             RExC_seen_d_op = TRUE;
14787                         }
14788                         else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
14789                                  || isALPHA_FOLD_EQ(ender, 's'))
14790                         {
14791                             /* But, if the node begins or ends in an 's' we
14792                              * have to defer changing it into an EXACTFU, as
14793                              * the node could later get joined with another one
14794                              * that ends or begins with 's' creating an 'ss'
14795                              * sequence which would then wrongly match the
14796                              * sharp s without the target being UTF-8.  We
14797                              * create a special node that we resolve later when
14798                              * we join nodes together */
14799
14800                             node_type = EXACTFU_S_EDGE;
14801                         }
14802                         else {
14803                             node_type = EXACTFU;
14804                         }
14805                     }
14806
14807                     if (requires_utf8_target && node_type == EXACTFU) {
14808                         node_type = EXACTFU_ONLY8;
14809                     }
14810                 }
14811
14812                 OP(REGNODE_p(ret)) = node_type;
14813                 STR_LEN(REGNODE_p(ret)) = len;
14814                 RExC_emit += STR_SZ(len);
14815
14816                 /* If the node isn't a single character, it can't be SIMPLE */
14817                 if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
14818                     maybe_SIMPLE = 0;
14819                 }
14820
14821                 *flagp |= HASWIDTH | maybe_SIMPLE;
14822             }
14823
14824             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
14825             RExC_parse = p;
14826
14827             {
14828                 /* len is STRLEN which is unsigned, need to copy to signed */
14829                 IV iv = len;
14830                 if (iv < 0)
14831                     vFAIL("Internal disaster");
14832             }
14833
14834         } /* End of label 'defchar:' */
14835         break;
14836     } /* End of giant switch on input character */
14837
14838     /* Position parse to next real character */
14839     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14840                                             FALSE /* Don't force to /x */ );
14841     if (   *RExC_parse == '{'
14842         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14843     {
14844         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14845             RExC_parse++;
14846             vFAIL("Unescaped left brace in regex is illegal here");
14847         }
14848         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14849                                   " passed through");
14850     }
14851
14852     return(ret);
14853 }
14854
14855
14856 STATIC void
14857 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14858 {
14859     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
14860      * sets up the bitmap and any flags, removing those code points from the
14861      * inversion list, setting it to NULL should it become completely empty */
14862
14863     dVAR;
14864
14865     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14866     assert(PL_regkind[OP(node)] == ANYOF);
14867
14868     /* There is no bitmap for this node type */
14869     if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
14870         return;
14871     }
14872
14873     ANYOF_BITMAP_ZERO(node);
14874     if (*invlist_ptr) {
14875
14876         /* This gets set if we actually need to modify things */
14877         bool change_invlist = FALSE;
14878
14879         UV start, end;
14880
14881         /* Start looking through *invlist_ptr */
14882         invlist_iterinit(*invlist_ptr);
14883         while (invlist_iternext(*invlist_ptr, &start, &end)) {
14884             UV high;
14885             int i;
14886
14887             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14888                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14889             }
14890
14891             /* Quit if are above what we should change */
14892             if (start >= NUM_ANYOF_CODE_POINTS) {
14893                 break;
14894             }
14895
14896             change_invlist = TRUE;
14897
14898             /* Set all the bits in the range, up to the max that we are doing */
14899             high = (end < NUM_ANYOF_CODE_POINTS - 1)
14900                    ? end
14901                    : NUM_ANYOF_CODE_POINTS - 1;
14902             for (i = start; i <= (int) high; i++) {
14903                 if (! ANYOF_BITMAP_TEST(node, i)) {
14904                     ANYOF_BITMAP_SET(node, i);
14905                 }
14906             }
14907         }
14908         invlist_iterfinish(*invlist_ptr);
14909
14910         /* Done with loop; remove any code points that are in the bitmap from
14911          * *invlist_ptr; similarly for code points above the bitmap if we have
14912          * a flag to match all of them anyways */
14913         if (change_invlist) {
14914             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14915         }
14916         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14917             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14918         }
14919
14920         /* If have completely emptied it, remove it completely */
14921         if (_invlist_len(*invlist_ptr) == 0) {
14922             SvREFCNT_dec_NN(*invlist_ptr);
14923             *invlist_ptr = NULL;
14924         }
14925     }
14926 }
14927
14928 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14929    Character classes ([:foo:]) can also be negated ([:^foo:]).
14930    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14931    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14932    but trigger failures because they are currently unimplemented. */
14933
14934 #define POSIXCC_DONE(c)   ((c) == ':')
14935 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14936 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14937 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14938
14939 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
14940 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
14941 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
14942
14943 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14944
14945 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14946  * routine. q.v. */
14947 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
14948         if (posix_warnings) {                                               \
14949             if (! RExC_warn_text ) RExC_warn_text =                         \
14950                                          (AV *) sv_2mortal((SV *) newAV()); \
14951             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
14952                                              WARNING_PREFIX                 \
14953                                              text                           \
14954                                              REPORT_LOCATION,               \
14955                                              REPORT_LOCATION_ARGS(p)));     \
14956         }                                                                   \
14957     } STMT_END
14958 #define CLEAR_POSIX_WARNINGS()                                              \
14959     STMT_START {                                                            \
14960         if (posix_warnings && RExC_warn_text)                               \
14961             av_clear(RExC_warn_text);                                       \
14962     } STMT_END
14963
14964 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
14965     STMT_START {                                                            \
14966         CLEAR_POSIX_WARNINGS();                                             \
14967         return ret;                                                         \
14968     } STMT_END
14969
14970 STATIC int
14971 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14972
14973     const char * const s,      /* Where the putative posix class begins.
14974                                   Normally, this is one past the '['.  This
14975                                   parameter exists so it can be somewhere
14976                                   besides RExC_parse. */
14977     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14978                                   NULL */
14979     AV ** posix_warnings,      /* Where to place any generated warnings, or
14980                                   NULL */
14981     const bool check_only      /* Don't die if error */
14982 )
14983 {
14984     /* This parses what the caller thinks may be one of the three POSIX
14985      * constructs:
14986      *  1) a character class, like [:blank:]
14987      *  2) a collating symbol, like [. .]
14988      *  3) an equivalence class, like [= =]
14989      * In the latter two cases, it croaks if it finds a syntactically legal
14990      * one, as these are not handled by Perl.
14991      *
14992      * The main purpose is to look for a POSIX character class.  It returns:
14993      *  a) the class number
14994      *      if it is a completely syntactically and semantically legal class.
14995      *      'updated_parse_ptr', if not NULL, is set to point to just after the
14996      *      closing ']' of the class
14997      *  b) OOB_NAMEDCLASS
14998      *      if it appears that one of the three POSIX constructs was meant, but
14999      *      its specification was somehow defective.  'updated_parse_ptr', if
15000      *      not NULL, is set to point to the character just after the end
15001      *      character of the class.  See below for handling of warnings.
15002      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15003      *      if it  doesn't appear that a POSIX construct was intended.
15004      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15005      *      raised.
15006      *
15007      * In b) there may be errors or warnings generated.  If 'check_only' is
15008      * TRUE, then any errors are discarded.  Warnings are returned to the
15009      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15010      * instead it is NULL, warnings are suppressed.
15011      *
15012      * The reason for this function, and its complexity is that a bracketed
15013      * character class can contain just about anything.  But it's easy to
15014      * mistype the very specific posix class syntax but yielding a valid
15015      * regular bracketed class, so it silently gets compiled into something
15016      * quite unintended.
15017      *
15018      * The solution adopted here maintains backward compatibility except that
15019      * it adds a warning if it looks like a posix class was intended but
15020      * improperly specified.  The warning is not raised unless what is input
15021      * very closely resembles one of the 14 legal posix classes.  To do this,
15022      * it uses fuzzy parsing.  It calculates how many single-character edits it
15023      * would take to transform what was input into a legal posix class.  Only
15024      * if that number is quite small does it think that the intention was a
15025      * posix class.  Obviously these are heuristics, and there will be cases
15026      * where it errs on one side or another, and they can be tweaked as
15027      * experience informs.
15028      *
15029      * The syntax for a legal posix class is:
15030      *
15031      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15032      *
15033      * What this routine considers syntactically to be an intended posix class
15034      * is this (the comments indicate some restrictions that the pattern
15035      * doesn't show):
15036      *
15037      *  qr/(?x: \[?                         # The left bracket, possibly
15038      *                                      # omitted
15039      *          \h*                         # possibly followed by blanks
15040      *          (?: \^ \h* )?               # possibly a misplaced caret
15041      *          [:;]?                       # The opening class character,
15042      *                                      # possibly omitted.  A typo
15043      *                                      # semi-colon can also be used.
15044      *          \h*
15045      *          \^?                         # possibly a correctly placed
15046      *                                      # caret, but not if there was also
15047      *                                      # a misplaced one
15048      *          \h*
15049      *          .{3,15}                     # The class name.  If there are
15050      *                                      # deviations from the legal syntax,
15051      *                                      # its edit distance must be close
15052      *                                      # to a real class name in order
15053      *                                      # for it to be considered to be
15054      *                                      # an intended posix class.
15055      *          \h*
15056      *          [[:punct:]]?                # The closing class character,
15057      *                                      # possibly omitted.  If not a colon
15058      *                                      # nor semi colon, the class name
15059      *                                      # must be even closer to a valid
15060      *                                      # one
15061      *          \h*
15062      *          \]?                         # The right bracket, possibly
15063      *                                      # omitted.
15064      *     )/
15065      *
15066      * In the above, \h must be ASCII-only.
15067      *
15068      * These are heuristics, and can be tweaked as field experience dictates.
15069      * There will be cases when someone didn't intend to specify a posix class
15070      * that this warns as being so.  The goal is to minimize these, while
15071      * maximizing the catching of things intended to be a posix class that
15072      * aren't parsed as such.
15073      */
15074
15075     const char* p             = s;
15076     const char * const e      = RExC_end;
15077     unsigned complement       = 0;      /* If to complement the class */
15078     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15079     bool has_opening_bracket  = FALSE;
15080     bool has_opening_colon    = FALSE;
15081     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15082                                                    valid class */
15083     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15084     const char* name_start;             /* ptr to class name first char */
15085
15086     /* If the number of single-character typos the input name is away from a
15087      * legal name is no more than this number, it is considered to have meant
15088      * the legal name */
15089     int max_distance          = 2;
15090
15091     /* to store the name.  The size determines the maximum length before we
15092      * decide that no posix class was intended.  Should be at least
15093      * sizeof("alphanumeric") */
15094     UV input_text[15];
15095     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15096
15097     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15098
15099     CLEAR_POSIX_WARNINGS();
15100
15101     if (p >= e) {
15102         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15103     }
15104
15105     if (*(p - 1) != '[') {
15106         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15107         found_problem = TRUE;
15108     }
15109     else {
15110         has_opening_bracket = TRUE;
15111     }
15112
15113     /* They could be confused and think you can put spaces between the
15114      * components */
15115     if (isBLANK(*p)) {
15116         found_problem = TRUE;
15117
15118         do {
15119             p++;
15120         } while (p < e && isBLANK(*p));
15121
15122         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15123     }
15124
15125     /* For [. .] and [= =].  These are quite different internally from [: :],
15126      * so they are handled separately.  */
15127     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15128                                             and 1 for at least one char in it
15129                                           */
15130     {
15131         const char open_char  = *p;
15132         const char * temp_ptr = p + 1;
15133
15134         /* These two constructs are not handled by perl, and if we find a
15135          * syntactically valid one, we croak.  khw, who wrote this code, finds
15136          * this explanation of them very unclear:
15137          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15138          * And searching the rest of the internet wasn't very helpful either.
15139          * It looks like just about any byte can be in these constructs,
15140          * depending on the locale.  But unless the pattern is being compiled
15141          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15142          * In that case, it looks like [= =] isn't allowed at all, and that
15143          * [. .] could be any single code point, but for longer strings the
15144          * constituent characters would have to be the ASCII alphabetics plus
15145          * the minus-hyphen.  Any sensible locale definition would limit itself
15146          * to these.  And any portable one definitely should.  Trying to parse
15147          * the general case is a nightmare (see [perl #127604]).  So, this code
15148          * looks only for interiors of these constructs that match:
15149          *      qr/.|[-\w]{2,}/
15150          * Using \w relaxes the apparent rules a little, without adding much
15151          * danger of mistaking something else for one of these constructs.
15152          *
15153          * [. .] in some implementations described on the internet is usable to
15154          * escape a character that otherwise is special in bracketed character
15155          * classes.  For example [.].] means a literal right bracket instead of
15156          * the ending of the class
15157          *
15158          * [= =] can legitimately contain a [. .] construct, but we don't
15159          * handle this case, as that [. .] construct will later get parsed
15160          * itself and croak then.  And [= =] is checked for even when not under
15161          * /l, as Perl has long done so.
15162          *
15163          * The code below relies on there being a trailing NUL, so it doesn't
15164          * have to keep checking if the parse ptr < e.
15165          */
15166         if (temp_ptr[1] == open_char) {
15167             temp_ptr++;
15168         }
15169         else while (    temp_ptr < e
15170                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15171         {
15172             temp_ptr++;
15173         }
15174
15175         if (*temp_ptr == open_char) {
15176             temp_ptr++;
15177             if (*temp_ptr == ']') {
15178                 temp_ptr++;
15179                 if (! found_problem && ! check_only) {
15180                     RExC_parse = (char *) temp_ptr;
15181                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15182                             "extensions", open_char, open_char);
15183                 }
15184
15185                 /* Here, the syntax wasn't completely valid, or else the call
15186                  * is to check-only */
15187                 if (updated_parse_ptr) {
15188                     *updated_parse_ptr = (char *) temp_ptr;
15189                 }
15190
15191                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15192             }
15193         }
15194
15195         /* If we find something that started out to look like one of these
15196          * constructs, but isn't, we continue below so that it can be checked
15197          * for being a class name with a typo of '.' or '=' instead of a colon.
15198          * */
15199     }
15200
15201     /* Here, we think there is a possibility that a [: :] class was meant, and
15202      * we have the first real character.  It could be they think the '^' comes
15203      * first */
15204     if (*p == '^') {
15205         found_problem = TRUE;
15206         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15207         complement = 1;
15208         p++;
15209
15210         if (isBLANK(*p)) {
15211             found_problem = TRUE;
15212
15213             do {
15214                 p++;
15215             } while (p < e && isBLANK(*p));
15216
15217             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15218         }
15219     }
15220
15221     /* But the first character should be a colon, which they could have easily
15222      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15223      * distinguish from a colon, so treat that as a colon).  */
15224     if (*p == ':') {
15225         p++;
15226         has_opening_colon = TRUE;
15227     }
15228     else if (*p == ';') {
15229         found_problem = TRUE;
15230         p++;
15231         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15232         has_opening_colon = TRUE;
15233     }
15234     else {
15235         found_problem = TRUE;
15236         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15237
15238         /* Consider an initial punctuation (not one of the recognized ones) to
15239          * be a left terminator */
15240         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15241             p++;
15242         }
15243     }
15244
15245     /* They may think that you can put spaces between the components */
15246     if (isBLANK(*p)) {
15247         found_problem = TRUE;
15248
15249         do {
15250             p++;
15251         } while (p < e && isBLANK(*p));
15252
15253         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15254     }
15255
15256     if (*p == '^') {
15257
15258         /* We consider something like [^:^alnum:]] to not have been intended to
15259          * be a posix class, but XXX maybe we should */
15260         if (complement) {
15261             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15262         }
15263
15264         complement = 1;
15265         p++;
15266     }
15267
15268     /* Again, they may think that you can put spaces between the components */
15269     if (isBLANK(*p)) {
15270         found_problem = TRUE;
15271
15272         do {
15273             p++;
15274         } while (p < e && isBLANK(*p));
15275
15276         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15277     }
15278
15279     if (*p == ']') {
15280
15281         /* XXX This ']' may be a typo, and something else was meant.  But
15282          * treating it as such creates enough complications, that that
15283          * possibility isn't currently considered here.  So we assume that the
15284          * ']' is what is intended, and if we've already found an initial '[',
15285          * this leaves this construct looking like [:] or [:^], which almost
15286          * certainly weren't intended to be posix classes */
15287         if (has_opening_bracket) {
15288             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15289         }
15290
15291         /* But this function can be called when we parse the colon for
15292          * something like qr/[alpha:]]/, so we back up to look for the
15293          * beginning */
15294         p--;
15295
15296         if (*p == ';') {
15297             found_problem = TRUE;
15298             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15299         }
15300         else if (*p != ':') {
15301
15302             /* XXX We are currently very restrictive here, so this code doesn't
15303              * consider the possibility that, say, /[alpha.]]/ was intended to
15304              * be a posix class. */
15305             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15306         }
15307
15308         /* Here we have something like 'foo:]'.  There was no initial colon,
15309          * and we back up over 'foo.  XXX Unlike the going forward case, we
15310          * don't handle typos of non-word chars in the middle */
15311         has_opening_colon = FALSE;
15312         p--;
15313
15314         while (p > RExC_start && isWORDCHAR(*p)) {
15315             p--;
15316         }
15317         p++;
15318
15319         /* Here, we have positioned ourselves to where we think the first
15320          * character in the potential class is */
15321     }
15322
15323     /* Now the interior really starts.  There are certain key characters that
15324      * can end the interior, or these could just be typos.  To catch both
15325      * cases, we may have to do two passes.  In the first pass, we keep on
15326      * going unless we come to a sequence that matches
15327      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15328      * This means it takes a sequence to end the pass, so two typos in a row if
15329      * that wasn't what was intended.  If the class is perfectly formed, just
15330      * this one pass is needed.  We also stop if there are too many characters
15331      * being accumulated, but this number is deliberately set higher than any
15332      * real class.  It is set high enough so that someone who thinks that
15333      * 'alphanumeric' is a correct name would get warned that it wasn't.
15334      * While doing the pass, we keep track of where the key characters were in
15335      * it.  If we don't find an end to the class, and one of the key characters
15336      * was found, we redo the pass, but stop when we get to that character.
15337      * Thus the key character was considered a typo in the first pass, but a
15338      * terminator in the second.  If two key characters are found, we stop at
15339      * the second one in the first pass.  Again this can miss two typos, but
15340      * catches a single one
15341      *
15342      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15343      * point to the first key character.  For the second pass, it starts as -1.
15344      * */
15345
15346     name_start = p;
15347   parse_name:
15348     {
15349         bool has_blank               = FALSE;
15350         bool has_upper               = FALSE;
15351         bool has_terminating_colon   = FALSE;
15352         bool has_terminating_bracket = FALSE;
15353         bool has_semi_colon          = FALSE;
15354         unsigned int name_len        = 0;
15355         int punct_count              = 0;
15356
15357         while (p < e) {
15358
15359             /* Squeeze out blanks when looking up the class name below */
15360             if (isBLANK(*p) ) {
15361                 has_blank = TRUE;
15362                 found_problem = TRUE;
15363                 p++;
15364                 continue;
15365             }
15366
15367             /* The name will end with a punctuation */
15368             if (isPUNCT(*p)) {
15369                 const char * peek = p + 1;
15370
15371                 /* Treat any non-']' punctuation followed by a ']' (possibly
15372                  * with intervening blanks) as trying to terminate the class.
15373                  * ']]' is very likely to mean a class was intended (but
15374                  * missing the colon), but the warning message that gets
15375                  * generated shows the error position better if we exit the
15376                  * loop at the bottom (eventually), so skip it here. */
15377                 if (*p != ']') {
15378                     if (peek < e && isBLANK(*peek)) {
15379                         has_blank = TRUE;
15380                         found_problem = TRUE;
15381                         do {
15382                             peek++;
15383                         } while (peek < e && isBLANK(*peek));
15384                     }
15385
15386                     if (peek < e && *peek == ']') {
15387                         has_terminating_bracket = TRUE;
15388                         if (*p == ':') {
15389                             has_terminating_colon = TRUE;
15390                         }
15391                         else if (*p == ';') {
15392                             has_semi_colon = TRUE;
15393                             has_terminating_colon = TRUE;
15394                         }
15395                         else {
15396                             found_problem = TRUE;
15397                         }
15398                         p = peek + 1;
15399                         goto try_posix;
15400                     }
15401                 }
15402
15403                 /* Here we have punctuation we thought didn't end the class.
15404                  * Keep track of the position of the key characters that are
15405                  * more likely to have been class-enders */
15406                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15407
15408                     /* Allow just one such possible class-ender not actually
15409                      * ending the class. */
15410                     if (possible_end) {
15411                         break;
15412                     }
15413                     possible_end = p;
15414                 }
15415
15416                 /* If we have too many punctuation characters, no use in
15417                  * keeping going */
15418                 if (++punct_count > max_distance) {
15419                     break;
15420                 }
15421
15422                 /* Treat the punctuation as a typo. */
15423                 input_text[name_len++] = *p;
15424                 p++;
15425             }
15426             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15427                 input_text[name_len++] = toLOWER(*p);
15428                 has_upper = TRUE;
15429                 found_problem = TRUE;
15430                 p++;
15431             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15432                 input_text[name_len++] = *p;
15433                 p++;
15434             }
15435             else {
15436                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15437                 p+= UTF8SKIP(p);
15438             }
15439
15440             /* The declaration of 'input_text' is how long we allow a potential
15441              * class name to be, before saying they didn't mean a class name at
15442              * all */
15443             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15444                 break;
15445             }
15446         }
15447
15448         /* We get to here when the possible class name hasn't been properly
15449          * terminated before:
15450          *   1) we ran off the end of the pattern; or
15451          *   2) found two characters, each of which might have been intended to
15452          *      be the name's terminator
15453          *   3) found so many punctuation characters in the purported name,
15454          *      that the edit distance to a valid one is exceeded
15455          *   4) we decided it was more characters than anyone could have
15456          *      intended to be one. */
15457
15458         found_problem = TRUE;
15459
15460         /* In the final two cases, we know that looking up what we've
15461          * accumulated won't lead to a match, even a fuzzy one. */
15462         if (   name_len >= C_ARRAY_LENGTH(input_text)
15463             || punct_count > max_distance)
15464         {
15465             /* If there was an intermediate key character that could have been
15466              * an intended end, redo the parse, but stop there */
15467             if (possible_end && possible_end != (char *) -1) {
15468                 possible_end = (char *) -1; /* Special signal value to say
15469                                                we've done a first pass */
15470                 p = name_start;
15471                 goto parse_name;
15472             }
15473
15474             /* Otherwise, it can't have meant to have been a class */
15475             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15476         }
15477
15478         /* If we ran off the end, and the final character was a punctuation
15479          * one, back up one, to look at that final one just below.  Later, we
15480          * will restore the parse pointer if appropriate */
15481         if (name_len && p == e && isPUNCT(*(p-1))) {
15482             p--;
15483             name_len--;
15484         }
15485
15486         if (p < e && isPUNCT(*p)) {
15487             if (*p == ']') {
15488                 has_terminating_bracket = TRUE;
15489
15490                 /* If this is a 2nd ']', and the first one is just below this
15491                  * one, consider that to be the real terminator.  This gives a
15492                  * uniform and better positioning for the warning message  */
15493                 if (   possible_end
15494                     && possible_end != (char *) -1
15495                     && *possible_end == ']'
15496                     && name_len && input_text[name_len - 1] == ']')
15497                 {
15498                     name_len--;
15499                     p = possible_end;
15500
15501                     /* And this is actually equivalent to having done the 2nd
15502                      * pass now, so set it to not try again */
15503                     possible_end = (char *) -1;
15504                 }
15505             }
15506             else {
15507                 if (*p == ':') {
15508                     has_terminating_colon = TRUE;
15509                 }
15510                 else if (*p == ';') {
15511                     has_semi_colon = TRUE;
15512                     has_terminating_colon = TRUE;
15513                 }
15514                 p++;
15515             }
15516         }
15517
15518     try_posix:
15519
15520         /* Here, we have a class name to look up.  We can short circuit the
15521          * stuff below for short names that can't possibly be meant to be a
15522          * class name.  (We can do this on the first pass, as any second pass
15523          * will yield an even shorter name) */
15524         if (name_len < 3) {
15525             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15526         }
15527
15528         /* Find which class it is.  Initially switch on the length of the name.
15529          * */
15530         switch (name_len) {
15531             case 4:
15532                 if (memEQs(name_start, 4, "word")) {
15533                     /* this is not POSIX, this is the Perl \w */
15534                     class_number = ANYOF_WORDCHAR;
15535                 }
15536                 break;
15537             case 5:
15538                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15539                  *                        graph lower print punct space upper
15540                  * Offset 4 gives the best switch position.  */
15541                 switch (name_start[4]) {
15542                     case 'a':
15543                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15544                             class_number = ANYOF_ALPHA;
15545                         break;
15546                     case 'e':
15547                         if (memBEGINs(name_start, 5, "spac")) /* space */
15548                             class_number = ANYOF_SPACE;
15549                         break;
15550                     case 'h':
15551                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15552                             class_number = ANYOF_GRAPH;
15553                         break;
15554                     case 'i':
15555                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15556                             class_number = ANYOF_ASCII;
15557                         break;
15558                     case 'k':
15559                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15560                             class_number = ANYOF_BLANK;
15561                         break;
15562                     case 'l':
15563                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15564                             class_number = ANYOF_CNTRL;
15565                         break;
15566                     case 'm':
15567                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15568                             class_number = ANYOF_ALPHANUMERIC;
15569                         break;
15570                     case 'r':
15571                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15572                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15573                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15574                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15575                         break;
15576                     case 't':
15577                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15578                             class_number = ANYOF_DIGIT;
15579                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15580                             class_number = ANYOF_PRINT;
15581                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15582                             class_number = ANYOF_PUNCT;
15583                         break;
15584                 }
15585                 break;
15586             case 6:
15587                 if (memEQs(name_start, 6, "xdigit"))
15588                     class_number = ANYOF_XDIGIT;
15589                 break;
15590         }
15591
15592         /* If the name exactly matches a posix class name the class number will
15593          * here be set to it, and the input almost certainly was meant to be a
15594          * posix class, so we can skip further checking.  If instead the syntax
15595          * is exactly correct, but the name isn't one of the legal ones, we
15596          * will return that as an error below.  But if neither of these apply,
15597          * it could be that no posix class was intended at all, or that one
15598          * was, but there was a typo.  We tease these apart by doing fuzzy
15599          * matching on the name */
15600         if (class_number == OOB_NAMEDCLASS && found_problem) {
15601             const UV posix_names[][6] = {
15602                                                 { 'a', 'l', 'n', 'u', 'm' },
15603                                                 { 'a', 'l', 'p', 'h', 'a' },
15604                                                 { 'a', 's', 'c', 'i', 'i' },
15605                                                 { 'b', 'l', 'a', 'n', 'k' },
15606                                                 { 'c', 'n', 't', 'r', 'l' },
15607                                                 { 'd', 'i', 'g', 'i', 't' },
15608                                                 { 'g', 'r', 'a', 'p', 'h' },
15609                                                 { 'l', 'o', 'w', 'e', 'r' },
15610                                                 { 'p', 'r', 'i', 'n', 't' },
15611                                                 { 'p', 'u', 'n', 'c', 't' },
15612                                                 { 's', 'p', 'a', 'c', 'e' },
15613                                                 { 'u', 'p', 'p', 'e', 'r' },
15614                                                 { 'w', 'o', 'r', 'd' },
15615                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15616                                             };
15617             /* The names of the above all have added NULs to make them the same
15618              * size, so we need to also have the real lengths */
15619             const UV posix_name_lengths[] = {
15620                                                 sizeof("alnum") - 1,
15621                                                 sizeof("alpha") - 1,
15622                                                 sizeof("ascii") - 1,
15623                                                 sizeof("blank") - 1,
15624                                                 sizeof("cntrl") - 1,
15625                                                 sizeof("digit") - 1,
15626                                                 sizeof("graph") - 1,
15627                                                 sizeof("lower") - 1,
15628                                                 sizeof("print") - 1,
15629                                                 sizeof("punct") - 1,
15630                                                 sizeof("space") - 1,
15631                                                 sizeof("upper") - 1,
15632                                                 sizeof("word")  - 1,
15633                                                 sizeof("xdigit")- 1
15634                                             };
15635             unsigned int i;
15636             int temp_max = max_distance;    /* Use a temporary, so if we
15637                                                reparse, we haven't changed the
15638                                                outer one */
15639
15640             /* Use a smaller max edit distance if we are missing one of the
15641              * delimiters */
15642             if (   has_opening_bracket + has_opening_colon < 2
15643                 || has_terminating_bracket + has_terminating_colon < 2)
15644             {
15645                 temp_max--;
15646             }
15647
15648             /* See if the input name is close to a legal one */
15649             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15650
15651                 /* Short circuit call if the lengths are too far apart to be
15652                  * able to match */
15653                 if (abs( (int) (name_len - posix_name_lengths[i]))
15654                     > temp_max)
15655                 {
15656                     continue;
15657                 }
15658
15659                 if (edit_distance(input_text,
15660                                   posix_names[i],
15661                                   name_len,
15662                                   posix_name_lengths[i],
15663                                   temp_max
15664                                  )
15665                     > -1)
15666                 { /* If it is close, it probably was intended to be a class */
15667                     goto probably_meant_to_be;
15668                 }
15669             }
15670
15671             /* Here the input name is not close enough to a valid class name
15672              * for us to consider it to be intended to be a posix class.  If
15673              * we haven't already done so, and the parse found a character that
15674              * could have been terminators for the name, but which we absorbed
15675              * as typos during the first pass, repeat the parse, signalling it
15676              * to stop at that character */
15677             if (possible_end && possible_end != (char *) -1) {
15678                 possible_end = (char *) -1;
15679                 p = name_start;
15680                 goto parse_name;
15681             }
15682
15683             /* Here neither pass found a close-enough class name */
15684             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15685         }
15686
15687     probably_meant_to_be:
15688
15689         /* Here we think that a posix specification was intended.  Update any
15690          * parse pointer */
15691         if (updated_parse_ptr) {
15692             *updated_parse_ptr = (char *) p;
15693         }
15694
15695         /* If a posix class name was intended but incorrectly specified, we
15696          * output or return the warnings */
15697         if (found_problem) {
15698
15699             /* We set flags for these issues in the parse loop above instead of
15700              * adding them to the list of warnings, because we can parse it
15701              * twice, and we only want one warning instance */
15702             if (has_upper) {
15703                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15704             }
15705             if (has_blank) {
15706                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15707             }
15708             if (has_semi_colon) {
15709                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15710             }
15711             else if (! has_terminating_colon) {
15712                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15713             }
15714             if (! has_terminating_bracket) {
15715                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15716             }
15717
15718             if (   posix_warnings
15719                 && RExC_warn_text
15720                 && av_top_index(RExC_warn_text) > -1)
15721             {
15722                 *posix_warnings = RExC_warn_text;
15723             }
15724         }
15725         else if (class_number != OOB_NAMEDCLASS) {
15726             /* If it is a known class, return the class.  The class number
15727              * #defines are structured so each complement is +1 to the normal
15728              * one */
15729             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15730         }
15731         else if (! check_only) {
15732
15733             /* Here, it is an unrecognized class.  This is an error (unless the
15734             * call is to check only, which we've already handled above) */
15735             const char * const complement_string = (complement)
15736                                                    ? "^"
15737                                                    : "";
15738             RExC_parse = (char *) p;
15739             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15740                         complement_string,
15741                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15742         }
15743     }
15744
15745     return OOB_NAMEDCLASS;
15746 }
15747 #undef ADD_POSIX_WARNING
15748
15749 STATIC unsigned  int
15750 S_regex_set_precedence(const U8 my_operator) {
15751
15752     /* Returns the precedence in the (?[...]) construct of the input operator,
15753      * specified by its character representation.  The precedence follows
15754      * general Perl rules, but it extends this so that ')' and ']' have (low)
15755      * precedence even though they aren't really operators */
15756
15757     switch (my_operator) {
15758         case '!':
15759             return 5;
15760         case '&':
15761             return 4;
15762         case '^':
15763         case '|':
15764         case '+':
15765         case '-':
15766             return 3;
15767         case ')':
15768             return 2;
15769         case ']':
15770             return 1;
15771     }
15772
15773     NOT_REACHED; /* NOTREACHED */
15774     return 0;   /* Silence compiler warning */
15775 }
15776
15777 STATIC regnode_offset
15778 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15779                     I32 *flagp, U32 depth,
15780                     char * const oregcomp_parse)
15781 {
15782     /* Handle the (?[...]) construct to do set operations */
15783
15784     U8 curchar;                     /* Current character being parsed */
15785     UV start, end;                  /* End points of code point ranges */
15786     SV* final = NULL;               /* The end result inversion list */
15787     SV* result_string;              /* 'final' stringified */
15788     AV* stack;                      /* stack of operators and operands not yet
15789                                        resolved */
15790     AV* fence_stack = NULL;         /* A stack containing the positions in
15791                                        'stack' of where the undealt-with left
15792                                        parens would be if they were actually
15793                                        put there */
15794     /* The 'volatile' is a workaround for an optimiser bug
15795      * in Solaris Studio 12.3. See RT #127455 */
15796     volatile IV fence = 0;          /* Position of where most recent undealt-
15797                                        with left paren in stack is; -1 if none.
15798                                      */
15799     STRLEN len;                     /* Temporary */
15800     regnode_offset node;                  /* Temporary, and final regnode returned by
15801                                        this function */
15802     const bool save_fold = FOLD;    /* Temporary */
15803     char *save_end, *save_parse;    /* Temporaries */
15804     const bool in_locale = LOC;     /* we turn off /l during processing */
15805
15806     GET_RE_DEBUG_FLAGS_DECL;
15807
15808     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15809
15810     DEBUG_PARSE("xcls");
15811
15812     if (in_locale) {
15813         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15814     }
15815
15816     /* The use of this operator implies /u.  This is required so that the
15817      * compile time values are valid in all runtime cases */
15818     REQUIRE_UNI_RULES(flagp, 0);
15819
15820     ckWARNexperimental(RExC_parse,
15821                        WARN_EXPERIMENTAL__REGEX_SETS,
15822                        "The regex_sets feature is experimental");
15823
15824     /* Everything in this construct is a metacharacter.  Operands begin with
15825      * either a '\' (for an escape sequence), or a '[' for a bracketed
15826      * character class.  Any other character should be an operator, or
15827      * parenthesis for grouping.  Both types of operands are handled by calling
15828      * regclass() to parse them.  It is called with a parameter to indicate to
15829      * return the computed inversion list.  The parsing here is implemented via
15830      * a stack.  Each entry on the stack is a single character representing one
15831      * of the operators; or else a pointer to an operand inversion list. */
15832
15833 #define IS_OPERATOR(a) SvIOK(a)
15834 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
15835
15836     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
15837      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15838      * with pronouncing it called it Reverse Polish instead, but now that YOU
15839      * know how to pronounce it you can use the correct term, thus giving due
15840      * credit to the person who invented it, and impressing your geek friends.
15841      * Wikipedia says that the pronounciation of "Ł" has been changing so that
15842      * it is now more like an English initial W (as in wonk) than an L.)
15843      *
15844      * This means that, for example, 'a | b & c' is stored on the stack as
15845      *
15846      * c  [4]
15847      * b  [3]
15848      * &  [2]
15849      * a  [1]
15850      * |  [0]
15851      *
15852      * where the numbers in brackets give the stack [array] element number.
15853      * In this implementation, parentheses are not stored on the stack.
15854      * Instead a '(' creates a "fence" so that the part of the stack below the
15855      * fence is invisible except to the corresponding ')' (this allows us to
15856      * replace testing for parens, by using instead subtraction of the fence
15857      * position).  As new operands are processed they are pushed onto the stack
15858      * (except as noted in the next paragraph).  New operators of higher
15859      * precedence than the current final one are inserted on the stack before
15860      * the lhs operand (so that when the rhs is pushed next, everything will be
15861      * in the correct positions shown above.  When an operator of equal or
15862      * lower precedence is encountered in parsing, all the stacked operations
15863      * of equal or higher precedence are evaluated, leaving the result as the
15864      * top entry on the stack.  This makes higher precedence operations
15865      * evaluate before lower precedence ones, and causes operations of equal
15866      * precedence to left associate.
15867      *
15868      * The only unary operator '!' is immediately pushed onto the stack when
15869      * encountered.  When an operand is encountered, if the top of the stack is
15870      * a '!", the complement is immediately performed, and the '!' popped.  The
15871      * resulting value is treated as a new operand, and the logic in the
15872      * previous paragraph is executed.  Thus in the expression
15873      *      [a] + ! [b]
15874      * the stack looks like
15875      *
15876      * !
15877      * a
15878      * +
15879      *
15880      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15881      * becomes
15882      *
15883      * !b
15884      * a
15885      * +
15886      *
15887      * A ')' is treated as an operator with lower precedence than all the
15888      * aforementioned ones, which causes all operations on the stack above the
15889      * corresponding '(' to be evaluated down to a single resultant operand.
15890      * Then the fence for the '(' is removed, and the operand goes through the
15891      * algorithm above, without the fence.
15892      *
15893      * A separate stack is kept of the fence positions, so that the position of
15894      * the latest so-far unbalanced '(' is at the top of it.
15895      *
15896      * The ']' ending the construct is treated as the lowest operator of all,
15897      * so that everything gets evaluated down to a single operand, which is the
15898      * result */
15899
15900     sv_2mortal((SV *)(stack = newAV()));
15901     sv_2mortal((SV *)(fence_stack = newAV()));
15902
15903     while (RExC_parse < RExC_end) {
15904         I32 top_index;              /* Index of top-most element in 'stack' */
15905         SV** top_ptr;               /* Pointer to top 'stack' element */
15906         SV* current = NULL;         /* To contain the current inversion list
15907                                        operand */
15908         SV* only_to_avoid_leaks;
15909
15910         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15911                                 TRUE /* Force /x */ );
15912         if (RExC_parse >= RExC_end) {   /* Fail */
15913             break;
15914         }
15915
15916         curchar = UCHARAT(RExC_parse);
15917
15918 redo_curchar:
15919
15920 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15921                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15922         DEBUG_U(dump_regex_sets_structures(pRExC_state,
15923                                            stack, fence, fence_stack));
15924 #endif
15925
15926         top_index = av_tindex_skip_len_mg(stack);
15927
15928         switch (curchar) {
15929             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15930             char stacked_operator;  /* The topmost operator on the 'stack'. */
15931             SV* lhs;                /* Operand to the left of the operator */
15932             SV* rhs;                /* Operand to the right of the operator */
15933             SV* fence_ptr;          /* Pointer to top element of the fence
15934                                        stack */
15935
15936             case '(':
15937
15938                 if (   RExC_parse < RExC_end - 2
15939                     && UCHARAT(RExC_parse + 1) == '?'
15940                     && UCHARAT(RExC_parse + 2) == '^')
15941                 {
15942                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
15943                      * This happens when we have some thing like
15944                      *
15945                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15946                      *   ...
15947                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15948                      *
15949                      * Here we would be handling the interpolated
15950                      * '$thai_or_lao'.  We handle this by a recursive call to
15951                      * ourselves which returns the inversion list the
15952                      * interpolated expression evaluates to.  We use the flags
15953                      * from the interpolated pattern. */
15954                     U32 save_flags = RExC_flags;
15955                     const char * save_parse;
15956
15957                     RExC_parse += 2;        /* Skip past the '(?' */
15958                     save_parse = RExC_parse;
15959
15960                     /* Parse the flags for the '(?'.  We already know the first
15961                      * flag to parse is a '^' */
15962                     parse_lparen_question_flags(pRExC_state);
15963
15964                     if (   RExC_parse >= RExC_end - 4
15965                         || UCHARAT(RExC_parse) != ':'
15966                         || UCHARAT(++RExC_parse) != '('
15967                         || UCHARAT(++RExC_parse) != '?'
15968                         || UCHARAT(++RExC_parse) != '[')
15969                     {
15970
15971                         /* In combination with the above, this moves the
15972                          * pointer to the point just after the first erroneous
15973                          * character. */
15974                         if (RExC_parse >= RExC_end - 4) {
15975                             RExC_parse = RExC_end;
15976                         }
15977                         else if (RExC_parse != save_parse) {
15978                             RExC_parse += (UTF)
15979                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
15980                                           : 1;
15981                         }
15982                         vFAIL("Expecting '(?flags:(?[...'");
15983                     }
15984
15985                     /* Recurse, with the meat of the embedded expression */
15986                     RExC_parse++;
15987                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15988                                                     depth+1, oregcomp_parse);
15989
15990                     /* Here, 'current' contains the embedded expression's
15991                      * inversion list, and RExC_parse points to the trailing
15992                      * ']'; the next character should be the ')' */
15993                     RExC_parse++;
15994                     if (UCHARAT(RExC_parse) != ')')
15995                         vFAIL("Expecting close paren for nested extended charclass");
15996
15997                     /* Then the ')' matching the original '(' handled by this
15998                      * case: statement */
15999                     RExC_parse++;
16000                     if (UCHARAT(RExC_parse) != ')')
16001                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
16002
16003                     RExC_flags = save_flags;
16004                     goto handle_operand;
16005                 }
16006
16007                 /* A regular '('.  Look behind for illegal syntax */
16008                 if (top_index - fence >= 0) {
16009                     /* If the top entry on the stack is an operator, it had
16010                      * better be a '!', otherwise the entry below the top
16011                      * operand should be an operator */
16012                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16013                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16014                         || (   IS_OPERAND(*top_ptr)
16015                             && (   top_index - fence < 1
16016                                 || ! (stacked_ptr = av_fetch(stack,
16017                                                              top_index - 1,
16018                                                              FALSE))
16019                                 || ! IS_OPERATOR(*stacked_ptr))))
16020                     {
16021                         RExC_parse++;
16022                         vFAIL("Unexpected '(' with no preceding operator");
16023                     }
16024                 }
16025
16026                 /* Stack the position of this undealt-with left paren */
16027                 av_push(fence_stack, newSViv(fence));
16028                 fence = top_index + 1;
16029                 break;
16030
16031             case '\\':
16032                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16033                  * multi-char folds are allowed.  */
16034                 if (!regclass(pRExC_state, flagp, depth+1,
16035                               TRUE, /* means parse just the next thing */
16036                               FALSE, /* don't allow multi-char folds */
16037                               FALSE, /* don't silence non-portable warnings.  */
16038                               TRUE,  /* strict */
16039                               FALSE, /* Require return to be an ANYOF */
16040                               &current))
16041                 {
16042                     goto regclass_failed;
16043                 }
16044
16045                 /* regclass() will return with parsing just the \ sequence,
16046                  * leaving the parse pointer at the next thing to parse */
16047                 RExC_parse--;
16048                 goto handle_operand;
16049
16050             case '[':   /* Is a bracketed character class */
16051             {
16052                 /* See if this is a [:posix:] class. */
16053                 bool is_posix_class = (OOB_NAMEDCLASS
16054                             < handle_possible_posix(pRExC_state,
16055                                                 RExC_parse + 1,
16056                                                 NULL,
16057                                                 NULL,
16058                                                 TRUE /* checking only */));
16059                 /* If it is a posix class, leave the parse pointer at the '['
16060                  * to fool regclass() into thinking it is part of a
16061                  * '[[:posix:]]'. */
16062                 if (! is_posix_class) {
16063                     RExC_parse++;
16064                 }
16065
16066                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16067                  * multi-char folds are allowed.  */
16068                 if (!regclass(pRExC_state, flagp, depth+1,
16069                                 is_posix_class, /* parse the whole char
16070                                                     class only if not a
16071                                                     posix class */
16072                                 FALSE, /* don't allow multi-char folds */
16073                                 TRUE, /* silence non-portable warnings. */
16074                                 TRUE, /* strict */
16075                                 FALSE, /* Require return to be an ANYOF */
16076                                 &current))
16077                 {
16078                     goto regclass_failed;
16079                 }
16080
16081                 if (! current) {
16082                     break;
16083                 }
16084
16085                 /* function call leaves parse pointing to the ']', except if we
16086                  * faked it */
16087                 if (is_posix_class) {
16088                     RExC_parse--;
16089                 }
16090
16091                 goto handle_operand;
16092             }
16093
16094             case ']':
16095                 if (top_index >= 1) {
16096                     goto join_operators;
16097                 }
16098
16099                 /* Only a single operand on the stack: are done */
16100                 goto done;
16101
16102             case ')':
16103                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16104                     if (UCHARAT(RExC_parse - 1) == ']')  {
16105                         break;
16106                     }
16107                     RExC_parse++;
16108                     vFAIL("Unexpected ')'");
16109                 }
16110
16111                 /* If nothing after the fence, is missing an operand */
16112                 if (top_index - fence < 0) {
16113                     RExC_parse++;
16114                     goto bad_syntax;
16115                 }
16116                 /* If at least two things on the stack, treat this as an
16117                   * operator */
16118                 if (top_index - fence >= 1) {
16119                     goto join_operators;
16120                 }
16121
16122                 /* Here only a single thing on the fenced stack, and there is a
16123                  * fence.  Get rid of it */
16124                 fence_ptr = av_pop(fence_stack);
16125                 assert(fence_ptr);
16126                 fence = SvIV(fence_ptr);
16127                 SvREFCNT_dec_NN(fence_ptr);
16128                 fence_ptr = NULL;
16129
16130                 if (fence < 0) {
16131                     fence = 0;
16132                 }
16133
16134                 /* Having gotten rid of the fence, we pop the operand at the
16135                  * stack top and process it as a newly encountered operand */
16136                 current = av_pop(stack);
16137                 if (IS_OPERAND(current)) {
16138                     goto handle_operand;
16139                 }
16140
16141                 RExC_parse++;
16142                 goto bad_syntax;
16143
16144             case '&':
16145             case '|':
16146             case '+':
16147             case '-':
16148             case '^':
16149
16150                 /* These binary operators should have a left operand already
16151                  * parsed */
16152                 if (   top_index - fence < 0
16153                     || top_index - fence == 1
16154                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16155                     || ! IS_OPERAND(*top_ptr))
16156                 {
16157                     goto unexpected_binary;
16158                 }
16159
16160                 /* If only the one operand is on the part of the stack visible
16161                  * to us, we just place this operator in the proper position */
16162                 if (top_index - fence < 2) {
16163
16164                     /* Place the operator before the operand */
16165
16166                     SV* lhs = av_pop(stack);
16167                     av_push(stack, newSVuv(curchar));
16168                     av_push(stack, lhs);
16169                     break;
16170                 }
16171
16172                 /* But if there is something else on the stack, we need to
16173                  * process it before this new operator if and only if the
16174                  * stacked operation has equal or higher precedence than the
16175                  * new one */
16176
16177              join_operators:
16178
16179                 /* The operator on the stack is supposed to be below both its
16180                  * operands */
16181                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16182                     || IS_OPERAND(*stacked_ptr))
16183                 {
16184                     /* But if not, it's legal and indicates we are completely
16185                      * done if and only if we're currently processing a ']',
16186                      * which should be the final thing in the expression */
16187                     if (curchar == ']') {
16188                         goto done;
16189                     }
16190
16191                   unexpected_binary:
16192                     RExC_parse++;
16193                     vFAIL2("Unexpected binary operator '%c' with no "
16194                            "preceding operand", curchar);
16195                 }
16196                 stacked_operator = (char) SvUV(*stacked_ptr);
16197
16198                 if (regex_set_precedence(curchar)
16199                     > regex_set_precedence(stacked_operator))
16200                 {
16201                     /* Here, the new operator has higher precedence than the
16202                      * stacked one.  This means we need to add the new one to
16203                      * the stack to await its rhs operand (and maybe more
16204                      * stuff).  We put it before the lhs operand, leaving
16205                      * untouched the stacked operator and everything below it
16206                      * */
16207                     lhs = av_pop(stack);
16208                     assert(IS_OPERAND(lhs));
16209
16210                     av_push(stack, newSVuv(curchar));
16211                     av_push(stack, lhs);
16212                     break;
16213                 }
16214
16215                 /* Here, the new operator has equal or lower precedence than
16216                  * what's already there.  This means the operation already
16217                  * there should be performed now, before the new one. */
16218
16219                 rhs = av_pop(stack);
16220                 if (! IS_OPERAND(rhs)) {
16221
16222                     /* This can happen when a ! is not followed by an operand,
16223                      * like in /(?[\t &!])/ */
16224                     goto bad_syntax;
16225                 }
16226
16227                 lhs = av_pop(stack);
16228
16229                 if (! IS_OPERAND(lhs)) {
16230
16231                     /* This can happen when there is an empty (), like in
16232                      * /(?[[0]+()+])/ */
16233                     goto bad_syntax;
16234                 }
16235
16236                 switch (stacked_operator) {
16237                     case '&':
16238                         _invlist_intersection(lhs, rhs, &rhs);
16239                         break;
16240
16241                     case '|':
16242                     case '+':
16243                         _invlist_union(lhs, rhs, &rhs);
16244                         break;
16245
16246                     case '-':
16247                         _invlist_subtract(lhs, rhs, &rhs);
16248                         break;
16249
16250                     case '^':   /* The union minus the intersection */
16251                     {
16252                         SV* i = NULL;
16253                         SV* u = NULL;
16254
16255                         _invlist_union(lhs, rhs, &u);
16256                         _invlist_intersection(lhs, rhs, &i);
16257                         _invlist_subtract(u, i, &rhs);
16258                         SvREFCNT_dec_NN(i);
16259                         SvREFCNT_dec_NN(u);
16260                         break;
16261                     }
16262                 }
16263                 SvREFCNT_dec(lhs);
16264
16265                 /* Here, the higher precedence operation has been done, and the
16266                  * result is in 'rhs'.  We overwrite the stacked operator with
16267                  * the result.  Then we redo this code to either push the new
16268                  * operator onto the stack or perform any higher precedence
16269                  * stacked operation */
16270                 only_to_avoid_leaks = av_pop(stack);
16271                 SvREFCNT_dec(only_to_avoid_leaks);
16272                 av_push(stack, rhs);
16273                 goto redo_curchar;
16274
16275             case '!':   /* Highest priority, right associative */
16276
16277                 /* If what's already at the top of the stack is another '!",
16278                  * they just cancel each other out */
16279                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16280                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16281                 {
16282                     only_to_avoid_leaks = av_pop(stack);
16283                     SvREFCNT_dec(only_to_avoid_leaks);
16284                 }
16285                 else { /* Otherwise, since it's right associative, just push
16286                           onto the stack */
16287                     av_push(stack, newSVuv(curchar));
16288                 }
16289                 break;
16290
16291             default:
16292                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16293                 if (RExC_parse >= RExC_end) {
16294                     break;
16295                 }
16296                 vFAIL("Unexpected character");
16297
16298           handle_operand:
16299
16300             /* Here 'current' is the operand.  If something is already on the
16301              * stack, we have to check if it is a !.  But first, the code above
16302              * may have altered the stack in the time since we earlier set
16303              * 'top_index'.  */
16304
16305             top_index = av_tindex_skip_len_mg(stack);
16306             if (top_index - fence >= 0) {
16307                 /* If the top entry on the stack is an operator, it had better
16308                  * be a '!', otherwise the entry below the top operand should
16309                  * be an operator */
16310                 top_ptr = av_fetch(stack, top_index, FALSE);
16311                 assert(top_ptr);
16312                 if (IS_OPERATOR(*top_ptr)) {
16313
16314                     /* The only permissible operator at the top of the stack is
16315                      * '!', which is applied immediately to this operand. */
16316                     curchar = (char) SvUV(*top_ptr);
16317                     if (curchar != '!') {
16318                         SvREFCNT_dec(current);
16319                         vFAIL2("Unexpected binary operator '%c' with no "
16320                                 "preceding operand", curchar);
16321                     }
16322
16323                     _invlist_invert(current);
16324
16325                     only_to_avoid_leaks = av_pop(stack);
16326                     SvREFCNT_dec(only_to_avoid_leaks);
16327
16328                     /* And we redo with the inverted operand.  This allows
16329                      * handling multiple ! in a row */
16330                     goto handle_operand;
16331                 }
16332                           /* Single operand is ok only for the non-binary ')'
16333                            * operator */
16334                 else if ((top_index - fence == 0 && curchar != ')')
16335                          || (top_index - fence > 0
16336                              && (! (stacked_ptr = av_fetch(stack,
16337                                                            top_index - 1,
16338                                                            FALSE))
16339                                  || IS_OPERAND(*stacked_ptr))))
16340                 {
16341                     SvREFCNT_dec(current);
16342                     vFAIL("Operand with no preceding operator");
16343                 }
16344             }
16345
16346             /* Here there was nothing on the stack or the top element was
16347              * another operand.  Just add this new one */
16348             av_push(stack, current);
16349
16350         } /* End of switch on next parse token */
16351
16352         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16353     } /* End of loop parsing through the construct */
16354
16355     vFAIL("Syntax error in (?[...])");
16356
16357   done:
16358
16359     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16360         if (RExC_parse < RExC_end) {
16361             RExC_parse++;
16362         }
16363
16364         vFAIL("Unexpected ']' with no following ')' in (?[...");
16365     }
16366
16367     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16368         vFAIL("Unmatched (");
16369     }
16370
16371     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16372         || ((final = av_pop(stack)) == NULL)
16373         || ! IS_OPERAND(final)
16374         || ! is_invlist(final)
16375         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16376     {
16377       bad_syntax:
16378         SvREFCNT_dec(final);
16379         vFAIL("Incomplete expression within '(?[ ])'");
16380     }
16381
16382     /* Here, 'final' is the resultant inversion list from evaluating the
16383      * expression.  Return it if so requested */
16384     if (return_invlist) {
16385         *return_invlist = final;
16386         return END;
16387     }
16388
16389     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16390      * expecting a string of ranges and individual code points */
16391     invlist_iterinit(final);
16392     result_string = newSVpvs("");
16393     while (invlist_iternext(final, &start, &end)) {
16394         if (start == end) {
16395             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16396         }
16397         else {
16398             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16399                                                      start,          end);
16400         }
16401     }
16402
16403     /* About to generate an ANYOF (or similar) node from the inversion list we
16404      * have calculated */
16405     save_parse = RExC_parse;
16406     RExC_parse = SvPV(result_string, len);
16407     save_end = RExC_end;
16408     RExC_end = RExC_parse + len;
16409     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16410
16411     /* We turn off folding around the call, as the class we have constructed
16412      * already has all folding taken into consideration, and we don't want
16413      * regclass() to add to that */
16414     RExC_flags &= ~RXf_PMf_FOLD;
16415     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16416      * folds are allowed.  */
16417     node = regclass(pRExC_state, flagp, depth+1,
16418                     FALSE, /* means parse the whole char class */
16419                     FALSE, /* don't allow multi-char folds */
16420                     TRUE, /* silence non-portable warnings.  The above may very
16421                              well have generated non-portable code points, but
16422                              they're valid on this machine */
16423                     FALSE, /* similarly, no need for strict */
16424                     FALSE, /* Require return to be an ANYOF */
16425                     NULL
16426                 );
16427
16428     RESTORE_WARNINGS;
16429     RExC_parse = save_parse + 1;
16430     RExC_end = save_end;
16431     SvREFCNT_dec_NN(final);
16432     SvREFCNT_dec_NN(result_string);
16433
16434     if (save_fold) {
16435         RExC_flags |= RXf_PMf_FOLD;
16436     }
16437
16438     if (!node)
16439         goto regclass_failed;
16440
16441     /* Fix up the node type if we are in locale.  (We have pretended we are
16442      * under /u for the purposes of regclass(), as this construct will only
16443      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16444      * as to cause any warnings about bad locales to be output in regexec.c),
16445      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16446      * reason we above forbid optimization into something other than an ANYOF
16447      * node is simply to minimize the number of code changes in regexec.c.
16448      * Otherwise we would have to create new EXACTish node types and deal with
16449      * them.  This decision could be revisited should this construct become
16450      * popular.
16451      *
16452      * (One might think we could look at the resulting ANYOF node and suppress
16453      * the flag if everything is above 255, as those would be UTF-8 only,
16454      * but this isn't true, as the components that led to that result could
16455      * have been locale-affected, and just happen to cancel each other out
16456      * under UTF-8 locales.) */
16457     if (in_locale) {
16458         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16459
16460         assert(OP(REGNODE_p(node)) == ANYOF);
16461
16462         OP(REGNODE_p(node)) = ANYOFL;
16463         ANYOF_FLAGS(REGNODE_p(node))
16464                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16465     }
16466
16467     nextchar(pRExC_state);
16468     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16469     return node;
16470
16471   regclass_failed:
16472     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16473                                                                 (UV) *flagp);
16474 }
16475
16476 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16477
16478 STATIC void
16479 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16480                              AV * stack, const IV fence, AV * fence_stack)
16481 {   /* Dumps the stacks in handle_regex_sets() */
16482
16483     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16484     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16485     SSize_t i;
16486
16487     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16488
16489     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16490
16491     if (stack_top < 0) {
16492         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16493     }
16494     else {
16495         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16496         for (i = stack_top; i >= 0; i--) {
16497             SV ** element_ptr = av_fetch(stack, i, FALSE);
16498             if (! element_ptr) {
16499             }
16500
16501             if (IS_OPERATOR(*element_ptr)) {
16502                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16503                                             (int) i, (int) SvIV(*element_ptr));
16504             }
16505             else {
16506                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16507                 sv_dump(*element_ptr);
16508             }
16509         }
16510     }
16511
16512     if (fence_stack_top < 0) {
16513         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16514     }
16515     else {
16516         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16517         for (i = fence_stack_top; i >= 0; i--) {
16518             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16519             if (! element_ptr) {
16520             }
16521
16522             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16523                                             (int) i, (int) SvIV(*element_ptr));
16524         }
16525     }
16526 }
16527
16528 #endif
16529
16530 #undef IS_OPERATOR
16531 #undef IS_OPERAND
16532
16533 STATIC void
16534 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16535 {
16536     /* This adds the Latin1/above-Latin1 folding rules.
16537      *
16538      * This should be called only for a Latin1-range code points, cp, which is
16539      * known to be involved in a simple fold with other code points above
16540      * Latin1.  It would give false results if /aa has been specified.
16541      * Multi-char folds are outside the scope of this, and must be handled
16542      * specially. */
16543
16544     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16545
16546     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16547
16548     /* The rules that are valid for all Unicode versions are hard-coded in */
16549     switch (cp) {
16550         case 'k':
16551         case 'K':
16552           *invlist =
16553              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16554             break;
16555         case 's':
16556         case 'S':
16557           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16558             break;
16559         case MICRO_SIGN:
16560           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16561           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16562             break;
16563         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16564         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16565           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16566             break;
16567         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16568           *invlist = add_cp_to_invlist(*invlist,
16569                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16570             break;
16571
16572         default:    /* Other code points are checked against the data for the
16573                        current Unicode version */
16574           {
16575             Size_t folds_count;
16576             unsigned int first_fold;
16577             const unsigned int * remaining_folds;
16578             UV folded_cp;
16579
16580             if (isASCII(cp)) {
16581                 folded_cp = toFOLD(cp);
16582             }
16583             else {
16584                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16585                 Size_t dummy_len;
16586                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16587             }
16588
16589             if (folded_cp > 255) {
16590                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16591             }
16592
16593             folds_count = _inverse_folds(folded_cp, &first_fold,
16594                                                     &remaining_folds);
16595             if (folds_count == 0) {
16596
16597                 /* Use deprecated warning to increase the chances of this being
16598                  * output */
16599                 ckWARN2reg_d(RExC_parse,
16600                         "Perl folding rules are not up-to-date for 0x%02X;"
16601                         " please use the perlbug utility to report;", cp);
16602             }
16603             else {
16604                 unsigned int i;
16605
16606                 if (first_fold > 255) {
16607                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16608                 }
16609                 for (i = 0; i < folds_count - 1; i++) {
16610                     if (remaining_folds[i] > 255) {
16611                         *invlist = add_cp_to_invlist(*invlist,
16612                                                     remaining_folds[i]);
16613                     }
16614                 }
16615             }
16616             break;
16617          }
16618     }
16619 }
16620
16621 STATIC void
16622 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16623 {
16624     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16625      * warnings. */
16626
16627     SV * msg;
16628     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16629
16630     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16631
16632     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16633         return;
16634     }
16635
16636     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16637         if (first_is_fatal) {           /* Avoid leaking this */
16638             av_undef(posix_warnings);   /* This isn't necessary if the
16639                                             array is mortal, but is a
16640                                             fail-safe */
16641             (void) sv_2mortal(msg);
16642             PREPARE_TO_DIE;
16643         }
16644         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16645         SvREFCNT_dec_NN(msg);
16646     }
16647
16648     UPDATE_WARNINGS_LOC(RExC_parse);
16649 }
16650
16651 STATIC AV *
16652 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16653 {
16654     /* This adds the string scalar <multi_string> to the array
16655      * <multi_char_matches>.  <multi_string> is known to have exactly
16656      * <cp_count> code points in it.  This is used when constructing a
16657      * bracketed character class and we find something that needs to match more
16658      * than a single character.
16659      *
16660      * <multi_char_matches> is actually an array of arrays.  Each top-level
16661      * element is an array that contains all the strings known so far that are
16662      * the same length.  And that length (in number of code points) is the same
16663      * as the index of the top-level array.  Hence, the [2] element is an
16664      * array, each element thereof is a string containing TWO code points;
16665      * while element [3] is for strings of THREE characters, and so on.  Since
16666      * this is for multi-char strings there can never be a [0] nor [1] element.
16667      *
16668      * When we rewrite the character class below, we will do so such that the
16669      * longest strings are written first, so that it prefers the longest
16670      * matching strings first.  This is done even if it turns out that any
16671      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16672      * Christiansen has agreed that this is ok.  This makes the test for the
16673      * ligature 'ffi' come before the test for 'ff', for example */
16674
16675     AV* this_array;
16676     AV** this_array_ptr;
16677
16678     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16679
16680     if (! multi_char_matches) {
16681         multi_char_matches = newAV();
16682     }
16683
16684     if (av_exists(multi_char_matches, cp_count)) {
16685         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16686         this_array = *this_array_ptr;
16687     }
16688     else {
16689         this_array = newAV();
16690         av_store(multi_char_matches, cp_count,
16691                  (SV*) this_array);
16692     }
16693     av_push(this_array, multi_string);
16694
16695     return multi_char_matches;
16696 }
16697
16698 /* The names of properties whose definitions are not known at compile time are
16699  * stored in this SV, after a constant heading.  So if the length has been
16700  * changed since initialization, then there is a run-time definition. */
16701 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
16702                                         (SvCUR(listsv) != initial_listsv_len)
16703
16704 /* There is a restricted set of white space characters that are legal when
16705  * ignoring white space in a bracketed character class.  This generates the
16706  * code to skip them.
16707  *
16708  * There is a line below that uses the same white space criteria but is outside
16709  * this macro.  Both here and there must use the same definition */
16710 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
16711     STMT_START {                                                        \
16712         if (do_skip) {                                                  \
16713             while (isBLANK_A(UCHARAT(p)))                               \
16714             {                                                           \
16715                 p++;                                                    \
16716             }                                                           \
16717         }                                                               \
16718     } STMT_END
16719
16720 STATIC regnode_offset
16721 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16722                  const bool stop_at_1,  /* Just parse the next thing, don't
16723                                            look for a full character class */
16724                  bool allow_mutiple_chars,
16725                  const bool silence_non_portable,   /* Don't output warnings
16726                                                        about too large
16727                                                        characters */
16728                  const bool strict,
16729                  bool optimizable,                  /* ? Allow a non-ANYOF return
16730                                                        node */
16731                  SV** ret_invlist  /* Return an inversion list, not a node */
16732           )
16733 {
16734     /* parse a bracketed class specification.  Most of these will produce an
16735      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16736      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
16737      * under /i with multi-character folds: it will be rewritten following the
16738      * paradigm of this example, where the <multi-fold>s are characters which
16739      * fold to multiple character sequences:
16740      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16741      * gets effectively rewritten as:
16742      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16743      * reg() gets called (recursively) on the rewritten version, and this
16744      * function will return what it constructs.  (Actually the <multi-fold>s
16745      * aren't physically removed from the [abcdefghi], it's just that they are
16746      * ignored in the recursion by means of a flag:
16747      * <RExC_in_multi_char_class>.)
16748      *
16749      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16750      * characters, with the corresponding bit set if that character is in the
16751      * list.  For characters above this, an inversion list is used.  There
16752      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16753      * determinable at compile time
16754      *
16755      * On success, returns the offset at which any next node should be placed
16756      * into the regex engine program being compiled.
16757      *
16758      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16759      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16760      * UTF-8
16761      */
16762
16763     dVAR;
16764     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16765     IV range = 0;
16766     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16767     regnode_offset ret = -1;    /* Initialized to an illegal value */
16768     STRLEN numlen;
16769     int namedclass = OOB_NAMEDCLASS;
16770     char *rangebegin = NULL;
16771     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
16772                                aren't available at the time this was called */
16773     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16774                                       than just initialized.  */
16775     SV* properties = NULL;    /* Code points that match \p{} \P{} */
16776     SV* posixes = NULL;     /* Code points that match classes like [:word:],
16777                                extended beyond the Latin1 range.  These have to
16778                                be kept separate from other code points for much
16779                                of this function because their handling  is
16780                                different under /i, and for most classes under
16781                                /d as well */
16782     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
16783                                separate for a while from the non-complemented
16784                                versions because of complications with /d
16785                                matching */
16786     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16787                                   treated more simply than the general case,
16788                                   leading to less compilation and execution
16789                                   work */
16790     UV element_count = 0;   /* Number of distinct elements in the class.
16791                                Optimizations may be possible if this is tiny */
16792     AV * multi_char_matches = NULL; /* Code points that fold to more than one
16793                                        character; used under /i */
16794     UV n;
16795     char * stop_ptr = RExC_end;    /* where to stop parsing */
16796
16797     /* ignore unescaped whitespace? */
16798     const bool skip_white = cBOOL(   ret_invlist
16799                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16800
16801     /* inversion list of code points this node matches only when the target
16802      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
16803      * /d) */
16804     SV* upper_latin1_only_utf8_matches = NULL;
16805
16806     /* Inversion list of code points this node matches regardless of things
16807      * like locale, folding, utf8ness of the target string */
16808     SV* cp_list = NULL;
16809
16810     /* Like cp_list, but code points on this list need to be checked for things
16811      * that fold to/from them under /i */
16812     SV* cp_foldable_list = NULL;
16813
16814     /* Like cp_list, but code points on this list are valid only when the
16815      * runtime locale is UTF-8 */
16816     SV* only_utf8_locale_list = NULL;
16817
16818     /* In a range, if one of the endpoints is non-character-set portable,
16819      * meaning that it hard-codes a code point that may mean a different
16820      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16821      * mnemonic '\t' which each mean the same character no matter which
16822      * character set the platform is on. */
16823     unsigned int non_portable_endpoint = 0;
16824
16825     /* Is the range unicode? which means on a platform that isn't 1-1 native
16826      * to Unicode (i.e. non-ASCII), each code point in it should be considered
16827      * to be a Unicode value.  */
16828     bool unicode_range = FALSE;
16829     bool invert = FALSE;    /* Is this class to be complemented */
16830
16831     bool warn_super = ALWAYS_WARN_SUPER;
16832
16833     const char * orig_parse = RExC_parse;
16834
16835     /* This variable is used to mark where the end in the input is of something
16836      * that looks like a POSIX construct but isn't.  During the parse, when
16837      * something looks like it could be such a construct is encountered, it is
16838      * checked for being one, but not if we've already checked this area of the
16839      * input.  Only after this position is reached do we check again */
16840     char *not_posix_region_end = RExC_parse - 1;
16841
16842     AV* posix_warnings = NULL;
16843     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16844     U8 op = END;    /* The returned node-type, initialized to an impossible
16845                        one.  */
16846     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
16847     U32 posixl = 0;       /* bit field of posix classes matched under /l */
16848
16849
16850 /* Flags as to what things aren't knowable until runtime.  (Note that these are
16851  * mutually exclusive.) */
16852 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
16853                                             haven't been defined as of yet */
16854 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
16855                                             UTF-8 or not */
16856 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
16857                                             what gets folded */
16858     U32 has_runtime_dependency = 0;     /* OR of the above flags */
16859
16860     GET_RE_DEBUG_FLAGS_DECL;
16861
16862     PERL_ARGS_ASSERT_REGCLASS;
16863 #ifndef DEBUGGING
16864     PERL_UNUSED_ARG(depth);
16865 #endif
16866
16867
16868     /* If wants an inversion list returned, we can't optimize to something
16869      * else. */
16870     if (ret_invlist) {
16871         optimizable = FALSE;
16872     }
16873
16874     DEBUG_PARSE("clas");
16875
16876 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
16877     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
16878                                    && UNICODE_DOT_DOT_VERSION == 0)
16879     allow_mutiple_chars = FALSE;
16880 #endif
16881
16882     /* We include the /i status at the beginning of this so that we can
16883      * know it at runtime */
16884     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
16885     initial_listsv_len = SvCUR(listsv);
16886     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
16887
16888     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16889
16890     assert(RExC_parse <= RExC_end);
16891
16892     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
16893         RExC_parse++;
16894         invert = TRUE;
16895         allow_mutiple_chars = FALSE;
16896         MARK_NAUGHTY(1);
16897         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16898     }
16899
16900     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16901     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16902         int maybe_class = handle_possible_posix(pRExC_state,
16903                                                 RExC_parse,
16904                                                 &not_posix_region_end,
16905                                                 NULL,
16906                                                 TRUE /* checking only */);
16907         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16908             ckWARN4reg(not_posix_region_end,
16909                     "POSIX syntax [%c %c] belongs inside character classes%s",
16910                     *RExC_parse, *RExC_parse,
16911                     (maybe_class == OOB_NAMEDCLASS)
16912                     ? ((POSIXCC_NOTYET(*RExC_parse))
16913                         ? " (but this one isn't implemented)"
16914                         : " (but this one isn't fully valid)")
16915                     : ""
16916                     );
16917         }
16918     }
16919
16920     /* If the caller wants us to just parse a single element, accomplish this
16921      * by faking the loop ending condition */
16922     if (stop_at_1 && RExC_end > RExC_parse) {
16923         stop_ptr = RExC_parse + 1;
16924     }
16925
16926     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16927     if (UCHARAT(RExC_parse) == ']')
16928         goto charclassloop;
16929
16930     while (1) {
16931
16932         if (   posix_warnings
16933             && av_tindex_skip_len_mg(posix_warnings) >= 0
16934             && RExC_parse > not_posix_region_end)
16935         {
16936             /* Warnings about posix class issues are considered tentative until
16937              * we are far enough along in the parse that we can no longer
16938              * change our mind, at which point we output them.  This is done
16939              * each time through the loop so that a later class won't zap them
16940              * before they have been dealt with. */
16941             output_posix_warnings(pRExC_state, posix_warnings);
16942         }
16943
16944         if  (RExC_parse >= stop_ptr) {
16945             break;
16946         }
16947
16948         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16949
16950         if  (UCHARAT(RExC_parse) == ']') {
16951             break;
16952         }
16953
16954       charclassloop:
16955
16956         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16957         save_value = value;
16958         save_prevvalue = prevvalue;
16959
16960         if (!range) {
16961             rangebegin = RExC_parse;
16962             element_count++;
16963             non_portable_endpoint = 0;
16964         }
16965         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16966             value = utf8n_to_uvchr((U8*)RExC_parse,
16967                                    RExC_end - RExC_parse,
16968                                    &numlen, UTF8_ALLOW_DEFAULT);
16969             RExC_parse += numlen;
16970         }
16971         else
16972             value = UCHARAT(RExC_parse++);
16973
16974         if (value == '[') {
16975             char * posix_class_end;
16976             namedclass = handle_possible_posix(pRExC_state,
16977                                                RExC_parse,
16978                                                &posix_class_end,
16979                                                do_posix_warnings ? &posix_warnings : NULL,
16980                                                FALSE    /* die if error */);
16981             if (namedclass > OOB_NAMEDCLASS) {
16982
16983                 /* If there was an earlier attempt to parse this particular
16984                  * posix class, and it failed, it was a false alarm, as this
16985                  * successful one proves */
16986                 if (   posix_warnings
16987                     && av_tindex_skip_len_mg(posix_warnings) >= 0
16988                     && not_posix_region_end >= RExC_parse
16989                     && not_posix_region_end <= posix_class_end)
16990                 {
16991                     av_undef(posix_warnings);
16992                 }
16993
16994                 RExC_parse = posix_class_end;
16995             }
16996             else if (namedclass == OOB_NAMEDCLASS) {
16997                 not_posix_region_end = posix_class_end;
16998             }
16999             else {
17000                 namedclass = OOB_NAMEDCLASS;
17001             }
17002         }
17003         else if (   RExC_parse - 1 > not_posix_region_end
17004                  && MAYBE_POSIXCC(value))
17005         {
17006             (void) handle_possible_posix(
17007                         pRExC_state,
17008                         RExC_parse - 1,  /* -1 because parse has already been
17009                                             advanced */
17010                         &not_posix_region_end,
17011                         do_posix_warnings ? &posix_warnings : NULL,
17012                         TRUE /* checking only */);
17013         }
17014         else if (  strict && ! skip_white
17015                  && (   _generic_isCC(value, _CC_VERTSPACE)
17016                      || is_VERTWS_cp_high(value)))
17017         {
17018             vFAIL("Literal vertical space in [] is illegal except under /x");
17019         }
17020         else if (value == '\\') {
17021             /* Is a backslash; get the code point of the char after it */
17022
17023             if (RExC_parse >= RExC_end) {
17024                 vFAIL("Unmatched [");
17025             }
17026
17027             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17028                 value = utf8n_to_uvchr((U8*)RExC_parse,
17029                                    RExC_end - RExC_parse,
17030                                    &numlen, UTF8_ALLOW_DEFAULT);
17031                 RExC_parse += numlen;
17032             }
17033             else
17034                 value = UCHARAT(RExC_parse++);
17035
17036             /* Some compilers cannot handle switching on 64-bit integer
17037              * values, therefore value cannot be an UV.  Yes, this will
17038              * be a problem later if we want switch on Unicode.
17039              * A similar issue a little bit later when switching on
17040              * namedclass. --jhi */
17041
17042             /* If the \ is escaping white space when white space is being
17043              * skipped, it means that that white space is wanted literally, and
17044              * is already in 'value'.  Otherwise, need to translate the escape
17045              * into what it signifies. */
17046             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17047
17048             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17049             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17050             case 's':   namedclass = ANYOF_SPACE;       break;
17051             case 'S':   namedclass = ANYOF_NSPACE;      break;
17052             case 'd':   namedclass = ANYOF_DIGIT;       break;
17053             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17054             case 'v':   namedclass = ANYOF_VERTWS;      break;
17055             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17056             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17057             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17058             case 'N':  /* Handle \N{NAME} in class */
17059                 {
17060                     const char * const backslash_N_beg = RExC_parse - 2;
17061                     int cp_count;
17062
17063                     if (! grok_bslash_N(pRExC_state,
17064                                         NULL,      /* No regnode */
17065                                         &value,    /* Yes single value */
17066                                         &cp_count, /* Multiple code pt count */
17067                                         flagp,
17068                                         strict,
17069                                         depth)
17070                     ) {
17071
17072                         if (*flagp & NEED_UTF8)
17073                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17074
17075                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17076
17077                         if (cp_count < 0) {
17078                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17079                         }
17080                         else if (cp_count == 0) {
17081                             ckWARNreg(RExC_parse,
17082                               "Ignoring zero length \\N{} in character class");
17083                         }
17084                         else { /* cp_count > 1 */
17085                             assert(cp_count > 1);
17086                             if (! RExC_in_multi_char_class) {
17087                                 if ( ! allow_mutiple_chars
17088                                     || invert
17089                                     || range
17090                                     || *RExC_parse == '-')
17091                                 {
17092                                     if (strict) {
17093                                         RExC_parse--;
17094                                         vFAIL("\\N{} here is restricted to one character");
17095                                     }
17096                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17097                                     break; /* <value> contains the first code
17098                                               point. Drop out of the switch to
17099                                               process it */
17100                                 }
17101                                 else {
17102                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17103                                                  RExC_parse - backslash_N_beg);
17104                                     multi_char_matches
17105                                         = add_multi_match(multi_char_matches,
17106                                                           multi_char_N,
17107                                                           cp_count);
17108                                 }
17109                             }
17110                         } /* End of cp_count != 1 */
17111
17112                         /* This element should not be processed further in this
17113                          * class */
17114                         element_count--;
17115                         value = save_value;
17116                         prevvalue = save_prevvalue;
17117                         continue;   /* Back to top of loop to get next char */
17118                     }
17119
17120                     /* Here, is a single code point, and <value> contains it */
17121                     unicode_range = TRUE;   /* \N{} are Unicode */
17122                 }
17123                 break;
17124             case 'p':
17125             case 'P':
17126                 {
17127                 char *e;
17128
17129                 /* \p means they want Unicode semantics */
17130                 REQUIRE_UNI_RULES(flagp, 0);
17131
17132                 if (RExC_parse >= RExC_end)
17133                     vFAIL2("Empty \\%c", (U8)value);
17134                 if (*RExC_parse == '{') {
17135                     const U8 c = (U8)value;
17136                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17137                     if (!e) {
17138                         RExC_parse++;
17139                         vFAIL2("Missing right brace on \\%c{}", c);
17140                     }
17141
17142                     RExC_parse++;
17143
17144                     /* White space is allowed adjacent to the braces and after
17145                      * any '^', even when not under /x */
17146                     while (isSPACE(*RExC_parse)) {
17147                          RExC_parse++;
17148                     }
17149
17150                     if (UCHARAT(RExC_parse) == '^') {
17151
17152                         /* toggle.  (The rhs xor gets the single bit that
17153                          * differs between P and p; the other xor inverts just
17154                          * that bit) */
17155                         value ^= 'P' ^ 'p';
17156
17157                         RExC_parse++;
17158                         while (isSPACE(*RExC_parse)) {
17159                             RExC_parse++;
17160                         }
17161                     }
17162
17163                     if (e == RExC_parse)
17164                         vFAIL2("Empty \\%c{}", c);
17165
17166                     n = e - RExC_parse;
17167                     while (isSPACE(*(RExC_parse + n - 1)))
17168                         n--;
17169
17170                 }   /* The \p isn't immediately followed by a '{' */
17171                 else if (! isALPHA(*RExC_parse)) {
17172                     RExC_parse += (UTF)
17173                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17174                                   : 1;
17175                     vFAIL2("Character following \\%c must be '{' or a "
17176                            "single-character Unicode property name",
17177                            (U8) value);
17178                 }
17179                 else {
17180                     e = RExC_parse;
17181                     n = 1;
17182                 }
17183                 {
17184                     char* name = RExC_parse;
17185
17186                     /* Any message returned about expanding the definition */
17187                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17188
17189                     /* If set TRUE, the property is user-defined as opposed to
17190                      * official Unicode */
17191                     bool user_defined = FALSE;
17192
17193                     SV * prop_definition = parse_uniprop_string(
17194                                             name, n, UTF, FOLD,
17195                                             FALSE, /* This is compile-time */
17196
17197                                             /* We can't defer this defn when
17198                                              * the full result is required in
17199                                              * this call */
17200                                             ! cBOOL(ret_invlist),
17201
17202                                             &user_defined,
17203                                             msg,
17204                                             0 /* Base level */
17205                                            );
17206                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17207                         assert(prop_definition == NULL);
17208                         RExC_parse = e + 1;
17209                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17210                                                thing so, or else the display is
17211                                                mojibake */
17212                             RExC_utf8 = TRUE;
17213                         }
17214                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17215                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17216                                     SvCUR(msg), SvPVX(msg)));
17217                     }
17218
17219                     if (! is_invlist(prop_definition)) {
17220
17221                         /* Here, the definition isn't known, so we have gotten
17222                          * returned a string that will be evaluated if and when
17223                          * encountered at runtime.  We add it to the list of
17224                          * such properties, along with whether it should be
17225                          * complemented or not */
17226                         if (value == 'P') {
17227                             sv_catpvs(listsv, "!");
17228                         }
17229                         else {
17230                             sv_catpvs(listsv, "+");
17231                         }
17232                         sv_catsv(listsv, prop_definition);
17233
17234                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17235
17236                         /* We don't know yet what this matches, so have to flag
17237                          * it */
17238                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17239                     }
17240                     else {
17241                         assert (prop_definition && is_invlist(prop_definition));
17242
17243                         /* Here we do have the complete property definition
17244                          *
17245                          * Temporary workaround for [perl #133136].  For this
17246                          * precise input that is in the .t that is failing,
17247                          * load utf8.pm, which is what the test wants, so that
17248                          * that .t passes */
17249                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17250                                         "foo\\p{Alnum}")
17251                             && ! hv_common(GvHVn(PL_incgv),
17252                                            NULL,
17253                                            "utf8.pm", sizeof("utf8.pm") - 1,
17254                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17255                         {
17256                             require_pv("utf8.pm");
17257                         }
17258
17259                         if (! user_defined &&
17260                             /* We warn on matching an above-Unicode code point
17261                              * if the match would return true, except don't
17262                              * warn for \p{All}, which has exactly one element
17263                              * = 0 */
17264                             (_invlist_contains_cp(prop_definition, 0x110000)
17265                                 && (! (_invlist_len(prop_definition) == 1
17266                                        && *invlist_array(prop_definition) == 0))))
17267                         {
17268                             warn_super = TRUE;
17269                         }
17270
17271                         /* Invert if asking for the complement */
17272                         if (value == 'P') {
17273                             _invlist_union_complement_2nd(properties,
17274                                                           prop_definition,
17275                                                           &properties);
17276                         }
17277                         else {
17278                             _invlist_union(properties, prop_definition, &properties);
17279                         }
17280                     }
17281                 }
17282
17283                 RExC_parse = e + 1;
17284                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17285                                                 named */
17286                 }
17287                 break;
17288             case 'n':   value = '\n';                   break;
17289             case 'r':   value = '\r';                   break;
17290             case 't':   value = '\t';                   break;
17291             case 'f':   value = '\f';                   break;
17292             case 'b':   value = '\b';                   break;
17293             case 'e':   value = ESC_NATIVE;             break;
17294             case 'a':   value = '\a';                   break;
17295             case 'o':
17296                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17297                 {
17298                     const char* error_msg;
17299                     bool valid = grok_bslash_o(&RExC_parse,
17300                                                RExC_end,
17301                                                &value,
17302                                                &error_msg,
17303                                                TO_OUTPUT_WARNINGS(RExC_parse),
17304                                                strict,
17305                                                silence_non_portable,
17306                                                UTF);
17307                     if (! valid) {
17308                         vFAIL(error_msg);
17309                     }
17310                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17311                 }
17312                 non_portable_endpoint++;
17313                 break;
17314             case 'x':
17315                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17316                 {
17317                     const char* error_msg;
17318                     bool valid = grok_bslash_x(&RExC_parse,
17319                                                RExC_end,
17320                                                &value,
17321                                                &error_msg,
17322                                                TO_OUTPUT_WARNINGS(RExC_parse),
17323                                                strict,
17324                                                silence_non_portable,
17325                                                UTF);
17326                     if (! valid) {
17327                         vFAIL(error_msg);
17328                     }
17329                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17330                 }
17331                 non_portable_endpoint++;
17332                 break;
17333             case 'c':
17334                 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17335                 UPDATE_WARNINGS_LOC(RExC_parse);
17336                 RExC_parse++;
17337                 non_portable_endpoint++;
17338                 break;
17339             case '0': case '1': case '2': case '3': case '4':
17340             case '5': case '6': case '7':
17341                 {
17342                     /* Take 1-3 octal digits */
17343                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17344                     numlen = (strict) ? 4 : 3;
17345                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17346                     RExC_parse += numlen;
17347                     if (numlen != 3) {
17348                         if (strict) {
17349                             RExC_parse += (UTF)
17350                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17351                                           : 1;
17352                             vFAIL("Need exactly 3 octal digits");
17353                         }
17354                         else if (   numlen < 3 /* like \08, \178 */
17355                                  && RExC_parse < RExC_end
17356                                  && isDIGIT(*RExC_parse)
17357                                  && ckWARN(WARN_REGEXP))
17358                         {
17359                             reg_warn_non_literal_string(
17360                                  RExC_parse + 1,
17361                                  form_short_octal_warning(RExC_parse, numlen));
17362                         }
17363                     }
17364                     non_portable_endpoint++;
17365                     break;
17366                 }
17367             default:
17368                 /* Allow \_ to not give an error */
17369                 if (isWORDCHAR(value) && value != '_') {
17370                     if (strict) {
17371                         vFAIL2("Unrecognized escape \\%c in character class",
17372                                (int)value);
17373                     }
17374                     else {
17375                         ckWARN2reg(RExC_parse,
17376                             "Unrecognized escape \\%c in character class passed through",
17377                             (int)value);
17378                     }
17379                 }
17380                 break;
17381             }   /* End of switch on char following backslash */
17382         } /* end of handling backslash escape sequences */
17383
17384         /* Here, we have the current token in 'value' */
17385
17386         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17387             U8 classnum;
17388
17389             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17390              * literal, as is the character that began the false range, i.e.
17391              * the 'a' in the examples */
17392             if (range) {
17393                 const int w = (RExC_parse >= rangebegin)
17394                                 ? RExC_parse - rangebegin
17395                                 : 0;
17396                 if (strict) {
17397                     vFAIL2utf8f(
17398                         "False [] range \"%" UTF8f "\"",
17399                         UTF8fARG(UTF, w, rangebegin));
17400                 }
17401                 else {
17402                     ckWARN2reg(RExC_parse,
17403                         "False [] range \"%" UTF8f "\"",
17404                         UTF8fARG(UTF, w, rangebegin));
17405                     cp_list = add_cp_to_invlist(cp_list, '-');
17406                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17407                                                             prevvalue);
17408                 }
17409
17410                 range = 0; /* this was not a true range */
17411                 element_count += 2; /* So counts for three values */
17412             }
17413
17414             classnum = namedclass_to_classnum(namedclass);
17415
17416             if (LOC && namedclass < ANYOF_POSIXL_MAX
17417 #ifndef HAS_ISASCII
17418                 && classnum != _CC_ASCII
17419 #endif
17420             ) {
17421                 SV* scratch_list = NULL;
17422
17423                 /* What the Posix classes (like \w, [:space:]) match isn't
17424                  * generally knowable under locale until actual match time.  A
17425                  * special node is used for these which has extra space for a
17426                  * bitmap, with a bit reserved for each named class that is to
17427                  * be matched against.  (This isn't needed for \p{} and
17428                  * pseudo-classes, as they are not affected by locale, and
17429                  * hence are dealt with separately.)  However, if a named class
17430                  * and its complement are both present, then it matches
17431                  * everything, and there is no runtime dependency.  Odd numbers
17432                  * are the complements of the next lower number, so xor works.
17433                  * (Note that something like [\w\D] should match everything,
17434                  * because \d should be a proper subset of \w.  But rather than
17435                  * trust that the locale is well behaved, we leave this to
17436                  * runtime to sort out) */
17437                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17438                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17439                     POSIXL_ZERO(posixl);
17440                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17441                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17442                     continue;   /* We could ignore the rest of the class, but
17443                                    best to parse it for any errors */
17444                 }
17445                 else { /* Here, isn't the complement of any already parsed
17446                           class */
17447                     POSIXL_SET(posixl, namedclass);
17448                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17449                     anyof_flags |= ANYOF_MATCHES_POSIXL;
17450
17451                     /* The above-Latin1 characters are not subject to locale
17452                      * rules.  Just add them to the unconditionally-matched
17453                      * list */
17454
17455                     /* Get the list of the above-Latin1 code points this
17456                      * matches */
17457                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17458                                             PL_XPosix_ptrs[classnum],
17459
17460                                             /* Odd numbers are complements,
17461                                              * like NDIGIT, NASCII, ... */
17462                                             namedclass % 2 != 0,
17463                                             &scratch_list);
17464                     /* Checking if 'cp_list' is NULL first saves an extra
17465                      * clone.  Its reference count will be decremented at the
17466                      * next union, etc, or if this is the only instance, at the
17467                      * end of the routine */
17468                     if (! cp_list) {
17469                         cp_list = scratch_list;
17470                     }
17471                     else {
17472                         _invlist_union(cp_list, scratch_list, &cp_list);
17473                         SvREFCNT_dec_NN(scratch_list);
17474                     }
17475                     continue;   /* Go get next character */
17476                 }
17477             }
17478             else {
17479
17480                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17481                  * matter (or is a Unicode property, which is skipped here). */
17482                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17483                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17484
17485                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17486                          * nor /l make a difference in what these match,
17487                          * therefore we just add what they match to cp_list. */
17488                         if (classnum != _CC_VERTSPACE) {
17489                             assert(   namedclass == ANYOF_HORIZWS
17490                                    || namedclass == ANYOF_NHORIZWS);
17491
17492                             /* It turns out that \h is just a synonym for
17493                              * XPosixBlank */
17494                             classnum = _CC_BLANK;
17495                         }
17496
17497                         _invlist_union_maybe_complement_2nd(
17498                                 cp_list,
17499                                 PL_XPosix_ptrs[classnum],
17500                                 namedclass % 2 != 0,    /* Complement if odd
17501                                                           (NHORIZWS, NVERTWS)
17502                                                         */
17503                                 &cp_list);
17504                     }
17505                 }
17506                 else if (   AT_LEAST_UNI_SEMANTICS
17507                          || classnum == _CC_ASCII
17508                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17509                                                    || classnum == _CC_XDIGIT)))
17510                 {
17511                     /* We usually have to worry about /d affecting what POSIX
17512                      * classes match, with special code needed because we won't
17513                      * know until runtime what all matches.  But there is no
17514                      * extra work needed under /u and /a; and [:ascii:] is
17515                      * unaffected by /d; and :digit: and :xdigit: don't have
17516                      * runtime differences under /d.  So we can special case
17517                      * these, and avoid some extra work below, and at runtime.
17518                      * */
17519                     _invlist_union_maybe_complement_2nd(
17520                                                      simple_posixes,
17521                                                       ((AT_LEAST_ASCII_RESTRICTED)
17522                                                        ? PL_Posix_ptrs[classnum]
17523                                                        : PL_XPosix_ptrs[classnum]),
17524                                                      namedclass % 2 != 0,
17525                                                      &simple_posixes);
17526                 }
17527                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17528                            complement and use nposixes */
17529                     SV** posixes_ptr = namedclass % 2 == 0
17530                                        ? &posixes
17531                                        : &nposixes;
17532                     _invlist_union_maybe_complement_2nd(
17533                                                      *posixes_ptr,
17534                                                      PL_XPosix_ptrs[classnum],
17535                                                      namedclass % 2 != 0,
17536                                                      posixes_ptr);
17537                 }
17538             }
17539         } /* end of namedclass \blah */
17540
17541         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17542
17543         /* If 'range' is set, 'value' is the ending of a range--check its
17544          * validity.  (If value isn't a single code point in the case of a
17545          * range, we should have figured that out above in the code that
17546          * catches false ranges).  Later, we will handle each individual code
17547          * point in the range.  If 'range' isn't set, this could be the
17548          * beginning of a range, so check for that by looking ahead to see if
17549          * the next real character to be processed is the range indicator--the
17550          * minus sign */
17551
17552         if (range) {
17553 #ifdef EBCDIC
17554             /* For unicode ranges, we have to test that the Unicode as opposed
17555              * to the native values are not decreasing.  (Above 255, there is
17556              * no difference between native and Unicode) */
17557             if (unicode_range && prevvalue < 255 && value < 255) {
17558                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17559                     goto backwards_range;
17560                 }
17561             }
17562             else
17563 #endif
17564             if (prevvalue > value) /* b-a */ {
17565                 int w;
17566 #ifdef EBCDIC
17567               backwards_range:
17568 #endif
17569                 w = RExC_parse - rangebegin;
17570                 vFAIL2utf8f(
17571                     "Invalid [] range \"%" UTF8f "\"",
17572                     UTF8fARG(UTF, w, rangebegin));
17573                 NOT_REACHED; /* NOTREACHED */
17574             }
17575         }
17576         else {
17577             prevvalue = value; /* save the beginning of the potential range */
17578             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17579                 && *RExC_parse == '-')
17580             {
17581                 char* next_char_ptr = RExC_parse + 1;
17582
17583                 /* Get the next real char after the '-' */
17584                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17585
17586                 /* If the '-' is at the end of the class (just before the ']',
17587                  * it is a literal minus; otherwise it is a range */
17588                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17589                     RExC_parse = next_char_ptr;
17590
17591                     /* a bad range like \w-, [:word:]- ? */
17592                     if (namedclass > OOB_NAMEDCLASS) {
17593                         if (strict || ckWARN(WARN_REGEXP)) {
17594                             const int w = RExC_parse >= rangebegin
17595                                           ?  RExC_parse - rangebegin
17596                                           : 0;
17597                             if (strict) {
17598                                 vFAIL4("False [] range \"%*.*s\"",
17599                                     w, w, rangebegin);
17600                             }
17601                             else {
17602                                 vWARN4(RExC_parse,
17603                                     "False [] range \"%*.*s\"",
17604                                     w, w, rangebegin);
17605                             }
17606                         }
17607                         cp_list = add_cp_to_invlist(cp_list, '-');
17608                         element_count++;
17609                     } else
17610                         range = 1;      /* yeah, it's a range! */
17611                     continue;   /* but do it the next time */
17612                 }
17613             }
17614         }
17615
17616         if (namedclass > OOB_NAMEDCLASS) {
17617             continue;
17618         }
17619
17620         /* Here, we have a single value this time through the loop, and
17621          * <prevvalue> is the beginning of the range, if any; or <value> if
17622          * not. */
17623
17624         /* non-Latin1 code point implies unicode semantics. */
17625         if (value > 255) {
17626             REQUIRE_UNI_RULES(flagp, 0);
17627         }
17628
17629         /* Ready to process either the single value, or the completed range.
17630          * For single-valued non-inverted ranges, we consider the possibility
17631          * of multi-char folds.  (We made a conscious decision to not do this
17632          * for the other cases because it can often lead to non-intuitive
17633          * results.  For example, you have the peculiar case that:
17634          *  "s s" =~ /^[^\xDF]+$/i => Y
17635          *  "ss"  =~ /^[^\xDF]+$/i => N
17636          *
17637          * See [perl #89750] */
17638         if (FOLD && allow_mutiple_chars && value == prevvalue) {
17639             if (    value == LATIN_SMALL_LETTER_SHARP_S
17640                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17641                                                         value)))
17642             {
17643                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17644
17645                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17646                 STRLEN foldlen;
17647
17648                 UV folded = _to_uni_fold_flags(
17649                                 value,
17650                                 foldbuf,
17651                                 &foldlen,
17652                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17653                                                    ? FOLD_FLAGS_NOMIX_ASCII
17654                                                    : 0)
17655                                 );
17656
17657                 /* Here, <folded> should be the first character of the
17658                  * multi-char fold of <value>, with <foldbuf> containing the
17659                  * whole thing.  But, if this fold is not allowed (because of
17660                  * the flags), <fold> will be the same as <value>, and should
17661                  * be processed like any other character, so skip the special
17662                  * handling */
17663                 if (folded != value) {
17664
17665                     /* Skip if we are recursed, currently parsing the class
17666                      * again.  Otherwise add this character to the list of
17667                      * multi-char folds. */
17668                     if (! RExC_in_multi_char_class) {
17669                         STRLEN cp_count = utf8_length(foldbuf,
17670                                                       foldbuf + foldlen);
17671                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17672
17673                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17674
17675                         multi_char_matches
17676                                         = add_multi_match(multi_char_matches,
17677                                                           multi_fold,
17678                                                           cp_count);
17679
17680                     }
17681
17682                     /* This element should not be processed further in this
17683                      * class */
17684                     element_count--;
17685                     value = save_value;
17686                     prevvalue = save_prevvalue;
17687                     continue;
17688                 }
17689             }
17690         }
17691
17692         if (strict && ckWARN(WARN_REGEXP)) {
17693             if (range) {
17694
17695                 /* If the range starts above 255, everything is portable and
17696                  * likely to be so for any forseeable character set, so don't
17697                  * warn. */
17698                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17699                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17700                 }
17701                 else if (prevvalue != value) {
17702
17703                     /* Under strict, ranges that stop and/or end in an ASCII
17704                      * printable should have each end point be a portable value
17705                      * for it (preferably like 'A', but we don't warn if it is
17706                      * a (portable) Unicode name or code point), and the range
17707                      * must be be all digits or all letters of the same case.
17708                      * Otherwise, the range is non-portable and unclear as to
17709                      * what it contains */
17710                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17711                         && (          non_portable_endpoint
17712                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17713                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17714                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17715                     ))) {
17716                         vWARN(RExC_parse, "Ranges of ASCII printables should"
17717                                           " be some subset of \"0-9\","
17718                                           " \"A-Z\", or \"a-z\"");
17719                     }
17720                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17721                         SSize_t index_start;
17722                         SSize_t index_final;
17723
17724                         /* But the nature of Unicode and languages mean we
17725                          * can't do the same checks for above-ASCII ranges,
17726                          * except in the case of digit ones.  These should
17727                          * contain only digits from the same group of 10.  The
17728                          * ASCII case is handled just above.  Hence here, the
17729                          * range could be a range of digits.  First some
17730                          * unlikely special cases.  Grandfather in that a range
17731                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17732                          * if its starting value is one of the 10 digits prior
17733                          * to it.  This is because it is an alternate way of
17734                          * writing 19D1, and some people may expect it to be in
17735                          * that group.  But it is bad, because it won't give
17736                          * the expected results.  In Unicode 5.2 it was
17737                          * considered to be in that group (of 11, hence), but
17738                          * this was fixed in the next version */
17739
17740                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17741                             goto warn_bad_digit_range;
17742                         }
17743                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
17744                                           &&     value <= 0x1D7FF))
17745                         {
17746                             /* This is the only other case currently in Unicode
17747                              * where the algorithm below fails.  The code
17748                              * points just above are the end points of a single
17749                              * range containing only decimal digits.  It is 5
17750                              * different series of 0-9.  All other ranges of
17751                              * digits currently in Unicode are just a single
17752                              * series.  (And mktables will notify us if a later
17753                              * Unicode version breaks this.)
17754                              *
17755                              * If the range being checked is at most 9 long,
17756                              * and the digit values represented are in
17757                              * numerical order, they are from the same series.
17758                              * */
17759                             if (         value - prevvalue > 9
17760                                 ||    (((    value - 0x1D7CE) % 10)
17761                                      <= (prevvalue - 0x1D7CE) % 10))
17762                             {
17763                                 goto warn_bad_digit_range;
17764                             }
17765                         }
17766                         else {
17767
17768                             /* For all other ranges of digits in Unicode, the
17769                              * algorithm is just to check if both end points
17770                              * are in the same series, which is the same range.
17771                              * */
17772                             index_start = _invlist_search(
17773                                                     PL_XPosix_ptrs[_CC_DIGIT],
17774                                                     prevvalue);
17775
17776                             /* Warn if the range starts and ends with a digit,
17777                              * and they are not in the same group of 10. */
17778                             if (   index_start >= 0
17779                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17780                                 && (index_final =
17781                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17782                                                     value)) != index_start
17783                                 && index_final >= 0
17784                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17785                             {
17786                               warn_bad_digit_range:
17787                                 vWARN(RExC_parse, "Ranges of digits should be"
17788                                                   " from the same group of"
17789                                                   " 10");
17790                             }
17791                         }
17792                     }
17793                 }
17794             }
17795             if ((! range || prevvalue == value) && non_portable_endpoint) {
17796                 if (isPRINT_A(value)) {
17797                     char literal[3];
17798                     unsigned d = 0;
17799                     if (isBACKSLASHED_PUNCT(value)) {
17800                         literal[d++] = '\\';
17801                     }
17802                     literal[d++] = (char) value;
17803                     literal[d++] = '\0';
17804
17805                     vWARN4(RExC_parse,
17806                            "\"%.*s\" is more clearly written simply as \"%s\"",
17807                            (int) (RExC_parse - rangebegin),
17808                            rangebegin,
17809                            literal
17810                         );
17811                 }
17812                 else if (isMNEMONIC_CNTRL(value)) {
17813                     vWARN4(RExC_parse,
17814                            "\"%.*s\" is more clearly written simply as \"%s\"",
17815                            (int) (RExC_parse - rangebegin),
17816                            rangebegin,
17817                            cntrl_to_mnemonic((U8) value)
17818                         );
17819                 }
17820             }
17821         }
17822
17823         /* Deal with this element of the class */
17824
17825 #ifndef EBCDIC
17826         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17827                                                     prevvalue, value);
17828 #else
17829         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17830          * that don't require special handling, we can just add the range like
17831          * we do for ASCII platforms */
17832         if ((UNLIKELY(prevvalue == 0) && value >= 255)
17833             || ! (prevvalue < 256
17834                     && (unicode_range
17835                         || (! non_portable_endpoint
17836                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17837                                 || (isUPPER_A(prevvalue)
17838                                     && isUPPER_A(value)))))))
17839         {
17840             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17841                                                         prevvalue, value);
17842         }
17843         else {
17844             /* Here, requires special handling.  This can be because it is a
17845              * range whose code points are considered to be Unicode, and so
17846              * must be individually translated into native, or because its a
17847              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17848              * EBCDIC, but we have defined them to include only the "expected"
17849              * upper or lower case ASCII alphabetics.  Subranges above 255 are
17850              * the same in native and Unicode, so can be added as a range */
17851             U8 start = NATIVE_TO_LATIN1(prevvalue);
17852             unsigned j;
17853             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17854             for (j = start; j <= end; j++) {
17855                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17856             }
17857             if (value > 255) {
17858                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17859                                                             256, value);
17860             }
17861         }
17862 #endif
17863
17864         range = 0; /* this range (if it was one) is done now */
17865     } /* End of loop through all the text within the brackets */
17866
17867     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17868         output_posix_warnings(pRExC_state, posix_warnings);
17869     }
17870
17871     /* If anything in the class expands to more than one character, we have to
17872      * deal with them by building up a substitute parse string, and recursively
17873      * calling reg() on it, instead of proceeding */
17874     if (multi_char_matches) {
17875         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17876         I32 cp_count;
17877         STRLEN len;
17878         char *save_end = RExC_end;
17879         char *save_parse = RExC_parse;
17880         char *save_start = RExC_start;
17881         Size_t constructed_prefix_len = 0; /* This gives the length of the
17882                                               constructed portion of the
17883                                               substitute parse. */
17884         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17885                                        a "|" */
17886         I32 reg_flags;
17887
17888         assert(! invert);
17889         /* Only one level of recursion allowed */
17890         assert(RExC_copy_start_in_constructed == RExC_precomp);
17891
17892 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17893            because too confusing */
17894         if (invert) {
17895             sv_catpvs(substitute_parse, "(?:");
17896         }
17897 #endif
17898
17899         /* Look at the longest folds first */
17900         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17901                         cp_count > 0;
17902                         cp_count--)
17903         {
17904
17905             if (av_exists(multi_char_matches, cp_count)) {
17906                 AV** this_array_ptr;
17907                 SV* this_sequence;
17908
17909                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17910                                                  cp_count, FALSE);
17911                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17912                                                                 &PL_sv_undef)
17913                 {
17914                     if (! first_time) {
17915                         sv_catpvs(substitute_parse, "|");
17916                     }
17917                     first_time = FALSE;
17918
17919                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17920                 }
17921             }
17922         }
17923
17924         /* If the character class contains anything else besides these
17925          * multi-character folds, have to include it in recursive parsing */
17926         if (element_count) {
17927             sv_catpvs(substitute_parse, "|[");
17928             constructed_prefix_len = SvCUR(substitute_parse);
17929             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17930
17931             /* Put in a closing ']' only if not going off the end, as otherwise
17932              * we are adding something that really isn't there */
17933             if (RExC_parse < RExC_end) {
17934                 sv_catpvs(substitute_parse, "]");
17935             }
17936         }
17937
17938         sv_catpvs(substitute_parse, ")");
17939 #if 0
17940         if (invert) {
17941             /* This is a way to get the parse to skip forward a whole named
17942              * sequence instead of matching the 2nd character when it fails the
17943              * first */
17944             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17945         }
17946 #endif
17947
17948         /* Set up the data structure so that any errors will be properly
17949          * reported.  See the comments at the definition of
17950          * REPORT_LOCATION_ARGS for details */
17951         RExC_copy_start_in_input = (char *) orig_parse;
17952         RExC_start = RExC_parse = SvPV(substitute_parse, len);
17953         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17954         RExC_end = RExC_parse + len;
17955         RExC_in_multi_char_class = 1;
17956
17957         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17958
17959         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17960
17961         /* And restore so can parse the rest of the pattern */
17962         RExC_parse = save_parse;
17963         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17964         RExC_end = save_end;
17965         RExC_in_multi_char_class = 0;
17966         SvREFCNT_dec_NN(multi_char_matches);
17967         return ret;
17968     }
17969
17970     /* If folding, we calculate all characters that could fold to or from the
17971      * ones already on the list */
17972     if (cp_foldable_list) {
17973         if (FOLD) {
17974             UV start, end;      /* End points of code point ranges */
17975
17976             SV* fold_intersection = NULL;
17977             SV** use_list;
17978
17979             /* Our calculated list will be for Unicode rules.  For locale
17980              * matching, we have to keep a separate list that is consulted at
17981              * runtime only when the locale indicates Unicode rules (and we
17982              * don't include potential matches in the ASCII/Latin1 range, as
17983              * any code point could fold to any other, based on the run-time
17984              * locale).   For non-locale, we just use the general list */
17985             if (LOC) {
17986                 use_list = &only_utf8_locale_list;
17987             }
17988             else {
17989                 use_list = &cp_list;
17990             }
17991
17992             /* Only the characters in this class that participate in folds need
17993              * be checked.  Get the intersection of this class and all the
17994              * possible characters that are foldable.  This can quickly narrow
17995              * down a large class */
17996             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
17997                                   &fold_intersection);
17998
17999             /* Now look at the foldable characters in this class individually */
18000             invlist_iterinit(fold_intersection);
18001             while (invlist_iternext(fold_intersection, &start, &end)) {
18002                 UV j;
18003                 UV folded;
18004
18005                 /* Look at every character in the range */
18006                 for (j = start; j <= end; j++) {
18007                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18008                     STRLEN foldlen;
18009                     unsigned int k;
18010                     Size_t folds_count;
18011                     unsigned int first_fold;
18012                     const unsigned int * remaining_folds;
18013
18014                     if (j < 256) {
18015
18016                         /* Under /l, we don't know what code points below 256
18017                          * fold to, except we do know the MICRO SIGN folds to
18018                          * an above-255 character if the locale is UTF-8, so we
18019                          * add it to the special list (in *use_list)  Otherwise
18020                          * we know now what things can match, though some folds
18021                          * are valid under /d only if the target is UTF-8.
18022                          * Those go in a separate list */
18023                         if (      IS_IN_SOME_FOLD_L1(j)
18024                             && ! (LOC && j != MICRO_SIGN))
18025                         {
18026
18027                             /* ASCII is always matched; non-ASCII is matched
18028                              * only under Unicode rules (which could happen
18029                              * under /l if the locale is a UTF-8 one */
18030                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18031                                 *use_list = add_cp_to_invlist(*use_list,
18032                                                             PL_fold_latin1[j]);
18033                             }
18034                             else if (j != PL_fold_latin1[j]) {
18035                                 upper_latin1_only_utf8_matches
18036                                         = add_cp_to_invlist(
18037                                                 upper_latin1_only_utf8_matches,
18038                                                 PL_fold_latin1[j]);
18039                             }
18040                         }
18041
18042                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18043                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18044                         {
18045                             add_above_Latin1_folds(pRExC_state,
18046                                                    (U8) j,
18047                                                    use_list);
18048                         }
18049                         continue;
18050                     }
18051
18052                     /* Here is an above Latin1 character.  We don't have the
18053                      * rules hard-coded for it.  First, get its fold.  This is
18054                      * the simple fold, as the multi-character folds have been
18055                      * handled earlier and separated out */
18056                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18057                                                         (ASCII_FOLD_RESTRICTED)
18058                                                         ? FOLD_FLAGS_NOMIX_ASCII
18059                                                         : 0);
18060
18061                     /* Single character fold of above Latin1.  Add everything
18062                      * in its fold closure to the list that this node should
18063                      * match. */
18064                     folds_count = _inverse_folds(folded, &first_fold,
18065                                                     &remaining_folds);
18066                     for (k = 0; k <= folds_count; k++) {
18067                         UV c = (k == 0)     /* First time through use itself */
18068                                 ? folded
18069                                 : (k == 1)  /* 2nd time use, the first fold */
18070                                    ? first_fold
18071
18072                                      /* Then the remaining ones */
18073                                    : remaining_folds[k-2];
18074
18075                         /* /aa doesn't allow folds between ASCII and non- */
18076                         if ((   ASCII_FOLD_RESTRICTED
18077                             && (isASCII(c) != isASCII(j))))
18078                         {
18079                             continue;
18080                         }
18081
18082                         /* Folds under /l which cross the 255/256 boundary are
18083                          * added to a separate list.  (These are valid only
18084                          * when the locale is UTF-8.) */
18085                         if (c < 256 && LOC) {
18086                             *use_list = add_cp_to_invlist(*use_list, c);
18087                             continue;
18088                         }
18089
18090                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18091                         {
18092                             cp_list = add_cp_to_invlist(cp_list, c);
18093                         }
18094                         else {
18095                             /* Similarly folds involving non-ascii Latin1
18096                              * characters under /d are added to their list */
18097                             upper_latin1_only_utf8_matches
18098                                     = add_cp_to_invlist(
18099                                                 upper_latin1_only_utf8_matches,
18100                                                 c);
18101                         }
18102                     }
18103                 }
18104             }
18105             SvREFCNT_dec_NN(fold_intersection);
18106         }
18107
18108         /* Now that we have finished adding all the folds, there is no reason
18109          * to keep the foldable list separate */
18110         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18111         SvREFCNT_dec_NN(cp_foldable_list);
18112     }
18113
18114     /* And combine the result (if any) with any inversion lists from posix
18115      * classes.  The lists are kept separate up to now because we don't want to
18116      * fold the classes */
18117     if (simple_posixes) {   /* These are the classes known to be unaffected by
18118                                /a, /aa, and /d */
18119         if (cp_list) {
18120             _invlist_union(cp_list, simple_posixes, &cp_list);
18121             SvREFCNT_dec_NN(simple_posixes);
18122         }
18123         else {
18124             cp_list = simple_posixes;
18125         }
18126     }
18127     if (posixes || nposixes) {
18128         if (! DEPENDS_SEMANTICS) {
18129
18130             /* For everything but /d, we can just add the current 'posixes' and
18131              * 'nposixes' to the main list */
18132             if (posixes) {
18133                 if (cp_list) {
18134                     _invlist_union(cp_list, posixes, &cp_list);
18135                     SvREFCNT_dec_NN(posixes);
18136                 }
18137                 else {
18138                     cp_list = posixes;
18139                 }
18140             }
18141             if (nposixes) {
18142                 if (cp_list) {
18143                     _invlist_union(cp_list, nposixes, &cp_list);
18144                     SvREFCNT_dec_NN(nposixes);
18145                 }
18146                 else {
18147                     cp_list = nposixes;
18148                 }
18149             }
18150         }
18151         else {
18152             /* Under /d, things like \w match upper Latin1 characters only if
18153              * the target string is in UTF-8.  But things like \W match all the
18154              * upper Latin1 characters if the target string is not in UTF-8.
18155              *
18156              * Handle the case with something like \W separately */
18157             if (nposixes) {
18158                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18159
18160                 /* A complemented posix class matches all upper Latin1
18161                  * characters if not in UTF-8.  And it matches just certain
18162                  * ones when in UTF-8.  That means those certain ones are
18163                  * matched regardless, so can just be added to the
18164                  * unconditional list */
18165                 if (cp_list) {
18166                     _invlist_union(cp_list, nposixes, &cp_list);
18167                     SvREFCNT_dec_NN(nposixes);
18168                     nposixes = NULL;
18169                 }
18170                 else {
18171                     cp_list = nposixes;
18172                 }
18173
18174                 /* Likewise for 'posixes' */
18175                 _invlist_union(posixes, cp_list, &cp_list);
18176
18177                 /* Likewise for anything else in the range that matched only
18178                  * under UTF-8 */
18179                 if (upper_latin1_only_utf8_matches) {
18180                     _invlist_union(cp_list,
18181                                    upper_latin1_only_utf8_matches,
18182                                    &cp_list);
18183                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18184                     upper_latin1_only_utf8_matches = NULL;
18185                 }
18186
18187                 /* If we don't match all the upper Latin1 characters regardless
18188                  * of UTF-8ness, we have to set a flag to match the rest when
18189                  * not in UTF-8 */
18190                 _invlist_subtract(only_non_utf8_list, cp_list,
18191                                   &only_non_utf8_list);
18192                 if (_invlist_len(only_non_utf8_list) != 0) {
18193                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18194                 }
18195                 SvREFCNT_dec_NN(only_non_utf8_list);
18196             }
18197             else {
18198                 /* Here there were no complemented posix classes.  That means
18199                  * the upper Latin1 characters in 'posixes' match only when the
18200                  * target string is in UTF-8.  So we have to add them to the
18201                  * list of those types of code points, while adding the
18202                  * remainder to the unconditional list.
18203                  *
18204                  * First calculate what they are */
18205                 SV* nonascii_but_latin1_properties = NULL;
18206                 _invlist_intersection(posixes, PL_UpperLatin1,
18207                                       &nonascii_but_latin1_properties);
18208
18209                 /* And add them to the final list of such characters. */
18210                 _invlist_union(upper_latin1_only_utf8_matches,
18211                                nonascii_but_latin1_properties,
18212                                &upper_latin1_only_utf8_matches);
18213
18214                 /* Remove them from what now becomes the unconditional list */
18215                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18216                                   &posixes);
18217
18218                 /* And add those unconditional ones to the final list */
18219                 if (cp_list) {
18220                     _invlist_union(cp_list, posixes, &cp_list);
18221                     SvREFCNT_dec_NN(posixes);
18222                     posixes = NULL;
18223                 }
18224                 else {
18225                     cp_list = posixes;
18226                 }
18227
18228                 SvREFCNT_dec(nonascii_but_latin1_properties);
18229
18230                 /* Get rid of any characters from the conditional list that we
18231                  * now know are matched unconditionally, which may make that
18232                  * list empty */
18233                 _invlist_subtract(upper_latin1_only_utf8_matches,
18234                                   cp_list,
18235                                   &upper_latin1_only_utf8_matches);
18236                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18237                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18238                     upper_latin1_only_utf8_matches = NULL;
18239                 }
18240             }
18241         }
18242     }
18243
18244     /* And combine the result (if any) with any inversion list from properties.
18245      * The lists are kept separate up to now so that we can distinguish the two
18246      * in regards to matching above-Unicode.  A run-time warning is generated
18247      * if a Unicode property is matched against a non-Unicode code point. But,
18248      * we allow user-defined properties to match anything, without any warning,
18249      * and we also suppress the warning if there is a portion of the character
18250      * class that isn't a Unicode property, and which matches above Unicode, \W
18251      * or [\x{110000}] for example.
18252      * (Note that in this case, unlike the Posix one above, there is no
18253      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18254      * forces Unicode semantics */
18255     if (properties) {
18256         if (cp_list) {
18257
18258             /* If it matters to the final outcome, see if a non-property
18259              * component of the class matches above Unicode.  If so, the
18260              * warning gets suppressed.  This is true even if just a single
18261              * such code point is specified, as, though not strictly correct if
18262              * another such code point is matched against, the fact that they
18263              * are using above-Unicode code points indicates they should know
18264              * the issues involved */
18265             if (warn_super) {
18266                 warn_super = ! (invert
18267                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18268             }
18269
18270             _invlist_union(properties, cp_list, &cp_list);
18271             SvREFCNT_dec_NN(properties);
18272         }
18273         else {
18274             cp_list = properties;
18275         }
18276
18277         if (warn_super) {
18278             anyof_flags
18279              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18280
18281             /* Because an ANYOF node is the only one that warns, this node
18282              * can't be optimized into something else */
18283             optimizable = FALSE;
18284         }
18285     }
18286
18287     /* Here, we have calculated what code points should be in the character
18288      * class.
18289      *
18290      * Now we can see about various optimizations.  Fold calculation (which we
18291      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18292      * would invert to include K, which under /i would match k, which it
18293      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18294      * folded until runtime */
18295
18296     /* If we didn't do folding, it's because some information isn't available
18297      * until runtime; set the run-time fold flag for these  We know to set the
18298      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18299      * at least one 0-255 range code point */
18300     if (LOC && FOLD) {
18301
18302         /* Some things on the list might be unconditionally included because of
18303          * other components.  Remove them, and clean up the list if it goes to
18304          * 0 elements */
18305         if (only_utf8_locale_list && cp_list) {
18306             _invlist_subtract(only_utf8_locale_list, cp_list,
18307                               &only_utf8_locale_list);
18308
18309             if (_invlist_len(only_utf8_locale_list) == 0) {
18310                 SvREFCNT_dec_NN(only_utf8_locale_list);
18311                 only_utf8_locale_list = NULL;
18312             }
18313         }
18314         if (    only_utf8_locale_list
18315             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18316                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18317         {
18318             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18319             anyof_flags
18320                  |= ANYOFL_FOLD
18321                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18322         }
18323         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18324             UV start, end;
18325             invlist_iterinit(cp_list);
18326             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18327                 anyof_flags |= ANYOFL_FOLD;
18328                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18329             }
18330             invlist_iterfinish(cp_list);
18331         }
18332     }
18333     else if (   DEPENDS_SEMANTICS
18334              && (    upper_latin1_only_utf8_matches
18335                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18336     {
18337         RExC_seen_d_op = TRUE;
18338         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18339     }
18340
18341     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18342      * compile time. */
18343     if (     cp_list
18344         &&   invert
18345         && ! has_runtime_dependency)
18346     {
18347         _invlist_invert(cp_list);
18348
18349         /* Clear the invert flag since have just done it here */
18350         invert = FALSE;
18351     }
18352
18353     if (ret_invlist) {
18354         *ret_invlist = cp_list;
18355
18356         return RExC_emit;
18357     }
18358
18359     /* All possible optimizations below still have these characteristics.
18360      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18361      * routine) */
18362     *flagp |= HASWIDTH|SIMPLE;
18363
18364     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18365         RExC_contains_locale = 1;
18366     }
18367
18368     /* Some character classes are equivalent to other nodes.  Such nodes take
18369      * up less room, and some nodes require fewer operations to execute, than
18370      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18371      * improve efficiency. */
18372
18373     if (optimizable) {
18374         PERL_UINT_FAST8_T i;
18375         Size_t partial_cp_count = 0;
18376         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18377         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18378
18379         if (cp_list) { /* Count the code points in enough ranges that we would
18380                           see all the ones possible in any fold in this version
18381                           of Unicode */
18382
18383             invlist_iterinit(cp_list);
18384             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18385                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18386                     break;
18387                 }
18388                 partial_cp_count += end[i] - start[i] + 1;
18389             }
18390
18391             invlist_iterfinish(cp_list);
18392         }
18393
18394         /* If we know at compile time that this matches every possible code
18395          * point, any run-time dependencies don't matter */
18396         if (start[0] == 0 && end[0] == UV_MAX) {
18397             if (invert) {
18398                 ret = reganode(pRExC_state, OPFAIL, 0);
18399             }
18400             else {
18401                 ret = reg_node(pRExC_state, SANY);
18402                 MARK_NAUGHTY(1);
18403             }
18404             goto not_anyof;
18405         }
18406
18407         /* Similarly, for /l posix classes, if both a class and its
18408          * complement match, any run-time dependencies don't matter */
18409         if (posixl) {
18410             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18411                                                         namedclass += 2)
18412             {
18413                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18414                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18415                 {
18416                     if (invert) {
18417                         ret = reganode(pRExC_state, OPFAIL, 0);
18418                     }
18419                     else {
18420                         ret = reg_node(pRExC_state, SANY);
18421                         MARK_NAUGHTY(1);
18422                     }
18423                     goto not_anyof;
18424                 }
18425             }
18426             /* For well-behaved locales, some classes are subsets of others,
18427              * so complementing the subset and including the non-complemented
18428              * superset should match everything, like [\D[:alnum:]], and
18429              * [[:^alpha:][:alnum:]], but some implementations of locales are
18430              * buggy, and khw thinks its a bad idea to have optimization change
18431              * behavior, even if it avoids an OS bug in a given case */
18432
18433 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18434
18435             /* If is a single posix /l class, can optimize to just that op.
18436              * Such a node will not match anything in the Latin1 range, as that
18437              * is not determinable until runtime, but will match whatever the
18438              * class does outside that range.  (Note that some classes won't
18439              * match anything outside the range, like [:ascii:]) */
18440             if (    isSINGLE_BIT_SET(posixl)
18441                 && (partial_cp_count == 0 || start[0] > 255))
18442             {
18443                 U8 classnum;
18444                 SV * class_above_latin1 = NULL;
18445                 bool already_inverted;
18446                 bool are_equivalent;
18447
18448                 /* Compute which bit is set, which is the same thing as, e.g.,
18449                  * ANYOF_CNTRL.  From
18450                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18451                  * */
18452                 static const int MultiplyDeBruijnBitPosition2[32] =
18453                     {
18454                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18455                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18456                     };
18457
18458                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18459                                                           * 0x077CB531U) >> 27];
18460                 classnum = namedclass_to_classnum(namedclass);
18461
18462                 /* The named classes are such that the inverted number is one
18463                  * larger than the non-inverted one */
18464                 already_inverted = namedclass
18465                                  - classnum_to_namedclass(classnum);
18466
18467                 /* Create an inversion list of the official property, inverted
18468                  * if the constructed node list is inverted, and restricted to
18469                  * only the above latin1 code points, which are the only ones
18470                  * known at compile time */
18471                 _invlist_intersection_maybe_complement_2nd(
18472                                                     PL_AboveLatin1,
18473                                                     PL_XPosix_ptrs[classnum],
18474                                                     already_inverted,
18475                                                     &class_above_latin1);
18476                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18477                                                                         FALSE);
18478                 SvREFCNT_dec_NN(class_above_latin1);
18479
18480                 if (are_equivalent) {
18481
18482                     /* Resolve the run-time inversion flag with this possibly
18483                      * inverted class */
18484                     invert = invert ^ already_inverted;
18485
18486                     ret = reg_node(pRExC_state,
18487                                    POSIXL + invert * (NPOSIXL - POSIXL));
18488                     FLAGS(REGNODE_p(ret)) = classnum;
18489                     goto not_anyof;
18490                 }
18491             }
18492         }
18493
18494         /* khw can't think of any other possible transformation involving
18495          * these. */
18496         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18497             goto is_anyof;
18498         }
18499
18500         if (! has_runtime_dependency) {
18501
18502             /* If the list is empty, nothing matches.  This happens, for
18503              * example, when a Unicode property that doesn't match anything is
18504              * the only element in the character class (perluniprops.pod notes
18505              * such properties). */
18506             if (partial_cp_count == 0) {
18507                 if (invert) {
18508                     ret = reg_node(pRExC_state, SANY);
18509                 }
18510                 else {
18511                     ret = reganode(pRExC_state, OPFAIL, 0);
18512                 }
18513
18514                 goto not_anyof;
18515             }
18516
18517             /* If matches everything but \n */
18518             if (   start[0] == 0 && end[0] == '\n' - 1
18519                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18520             {
18521                 assert (! invert);
18522                 ret = reg_node(pRExC_state, REG_ANY);
18523                 MARK_NAUGHTY(1);
18524                 goto not_anyof;
18525             }
18526         }
18527
18528         /* Next see if can optimize classes that contain just a few code points
18529          * into an EXACTish node.  The reason to do this is to let the
18530          * optimizer join this node with adjacent EXACTish ones.
18531          *
18532          * An EXACTFish node can be generated even if not under /i, and vice
18533          * versa.  But care must be taken.  An EXACTFish node has to be such
18534          * that it only matches precisely the code points in the class, but we
18535          * want to generate the least restrictive one that does that, to
18536          * increase the odds of being able to join with an adjacent node.  For
18537          * example, if the class contains [kK], we have to make it an EXACTFAA
18538          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18539          * /i or not is irrelevant in this case.  Less obvious is the pattern
18540          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18541          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18542          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18543          * that includes \X{02BC}, there is a multi-char fold that does, and so
18544          * the node generated for it must be an EXACTFish one.  On the other
18545          * hand qr/:/i should generate a plain EXACT node since the colon
18546          * participates in no fold whatsoever, and having it EXACT tells the
18547          * optimizer the target string cannot match unless it has a colon in
18548          * it.
18549          *
18550          * We don't typically generate an EXACTish node if doing so would
18551          * require changing the pattern to UTF-8, as that affects /d and
18552          * otherwise is slower.  However, under /i, not changing to UTF-8 can
18553          * miss some potential multi-character folds.  We calculate the
18554          * EXACTish node, and then decide if something would be missed if we
18555          * don't upgrade */
18556         if (   ! posixl
18557             && ! invert
18558
18559                 /* Only try if there are no more code points in the class than
18560                  * in the max possible fold */
18561             &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
18562
18563             && (start[0] < 256 || UTF || FOLD))
18564         {
18565             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18566             {
18567                 /* We can always make a single code point class into an
18568                  * EXACTish node. */
18569
18570                 if (LOC) {
18571
18572                     /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
18573                      * as that means there is a fold not known until runtime so
18574                      * shows as only a single code point here. */
18575                     op = (FOLD) ? EXACTFL : EXACTL;
18576                 }
18577                 else if (! FOLD) { /* Not /l and not /i */
18578                     op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
18579                 }
18580                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18581                                               small */
18582
18583                     /* Under /i, it gets a little tricky.  A code point that
18584                      * doesn't participate in a fold should be an EXACT node.
18585                      * We know this one isn't the result of a simple fold, or
18586                      * there'd be more than one code point in the list, but it
18587                      * could be part of a multi- character fold.  In that case
18588                      * we better not create an EXACT node, as we would wrongly
18589                      * be telling the optimizer that this code point must be in
18590                      * the target string, and that is wrong.  This is because
18591                      * if the sequence around this code point forms a
18592                      * multi-char fold, what needs to be in the string could be
18593                      * the code point that folds to the sequence.
18594                      *
18595                      * This handles the case of below-255 code points, as we
18596                      * have an easy look up for those.  The next clause handles
18597                      * the above-256 one */
18598                     op = IS_IN_SOME_FOLD_L1(start[0])
18599                          ? EXACTFU
18600                          : EXACT;
18601                 }
18602                 else {  /* /i, larger code point.  Since we are under /i, and
18603                            have just this code point, we know that it can't
18604                            fold to something else, so PL_InMultiCharFold
18605                            applies to it */
18606                     op = _invlist_contains_cp(PL_InMultiCharFold,
18607                                               start[0])
18608                          ? EXACTFU_ONLY8
18609                          : EXACT_ONLY8;
18610                 }
18611
18612                 value = start[0];
18613             }
18614             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18615                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18616             {
18617                 /* Here, the only runtime dependency, if any, is from /d, and
18618                  * the class matches more than one code point, and the lowest
18619                  * code point participates in some fold.  It might be that the
18620                  * other code points are /i equivalent to this one, and hence
18621                  * they would representable by an EXACTFish node.  Above, we
18622                  * eliminated classes that contain too many code points to be
18623                  * EXACTFish, with the test for MAX_FOLD_FROMS
18624                  *
18625                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18626                  * We do this because we have EXACTFAA at our disposal for the
18627                  * ASCII range */
18628                 if (partial_cp_count == 2 && isASCII(start[0])) {
18629
18630                     /* The only ASCII characters that participate in folds are
18631                      * alphabetics */
18632                     assert(isALPHA(start[0]));
18633                     if (   end[0] == start[0]   /* First range is a single
18634                                                    character, so 2nd exists */
18635                         && isALPHA_FOLD_EQ(start[0], start[1]))
18636                     {
18637
18638                         /* Here, is part of an ASCII fold pair */
18639
18640                         if (   ASCII_FOLD_RESTRICTED
18641                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18642                         {
18643                             /* If the second clause just above was true, it
18644                              * means we can't be under /i, or else the list
18645                              * would have included more than this fold pair.
18646                              * Therefore we have to exclude the possibility of
18647                              * whatever else it is that folds to these, by
18648                              * using EXACTFAA */
18649                             op = EXACTFAA;
18650                         }
18651                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18652
18653                             /* Here, there's no simple fold that start[0] is part
18654                              * of, but there is a multi-character one.  If we
18655                              * are not under /i, we want to exclude that
18656                              * possibility; if under /i, we want to include it
18657                              * */
18658                             op = (FOLD) ? EXACTFU : EXACTFAA;
18659                         }
18660                         else {
18661
18662                             /* Here, the only possible fold start[0] particpates in
18663                              * is with start[1].  /i or not isn't relevant */
18664                             op = EXACTFU;
18665                         }
18666
18667                         value = toFOLD(start[0]);
18668                     }
18669                 }
18670                 else if (  ! upper_latin1_only_utf8_matches
18671                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18672                                                                           == 2
18673                              && PL_fold_latin1[
18674                                invlist_highest(upper_latin1_only_utf8_matches)]
18675                              == start[0]))
18676                 {
18677                     /* Here, the smallest character is non-ascii or there are
18678                      * more than 2 code points matched by this node.  Also, we
18679                      * either don't have /d UTF-8 dependent matches, or if we
18680                      * do, they look like they could be a single character that
18681                      * is the fold of the lowest one in the always-match list.
18682                      * This test quickly excludes most of the false positives
18683                      * when there are /d UTF-8 depdendent matches.  These are
18684                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18685                      * SMALL LETTER A WITH GRAVE iff the target string is
18686                      * UTF-8.  (We don't have to worry above about exceeding
18687                      * the array bounds of PL_fold_latin1[] because any code
18688                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18689                      *
18690                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18691                      * points) in the ASCII range, so we can't use it here to
18692                      * artificially restrict the fold domain, so we check if
18693                      * the class does or does not match some EXACTFish node.
18694                      * Further, if we aren't under /i, and and the folded-to
18695                      * character is part of a multi-character fold, we can't do
18696                      * this optimization, as the sequence around it could be
18697                      * that multi-character fold, and we don't here know the
18698                      * context, so we have to assume it is that multi-char
18699                      * fold, to prevent potential bugs.
18700                      *
18701                      * To do the general case, we first find the fold of the
18702                      * lowest code point (which may be higher than the lowest
18703                      * one), then find everything that folds to it.  (The data
18704                      * structure we have only maps from the folded code points,
18705                      * so we have to do the earlier step.) */
18706
18707                     Size_t foldlen;
18708                     U8 foldbuf[UTF8_MAXBYTES_CASE];
18709                     UV folded = _to_uni_fold_flags(start[0],
18710                                                         foldbuf, &foldlen, 0);
18711                     unsigned int first_fold;
18712                     const unsigned int * remaining_folds;
18713                     Size_t folds_to_this_cp_count = _inverse_folds(
18714                                                             folded,
18715                                                             &first_fold,
18716                                                             &remaining_folds);
18717                     Size_t folds_count = folds_to_this_cp_count + 1;
18718                     SV * fold_list = _new_invlist(folds_count);
18719                     unsigned int i;
18720
18721                     /* If there are UTF-8 dependent matches, create a temporary
18722                      * list of what this node matches, including them. */
18723                     SV * all_cp_list = NULL;
18724                     SV ** use_this_list = &cp_list;
18725
18726                     if (upper_latin1_only_utf8_matches) {
18727                         all_cp_list = _new_invlist(0);
18728                         use_this_list = &all_cp_list;
18729                         _invlist_union(cp_list,
18730                                        upper_latin1_only_utf8_matches,
18731                                        use_this_list);
18732                     }
18733
18734                     /* Having gotten everything that participates in the fold
18735                      * containing the lowest code point, we turn that into an
18736                      * inversion list, making sure everything is included. */
18737                     fold_list = add_cp_to_invlist(fold_list, start[0]);
18738                     fold_list = add_cp_to_invlist(fold_list, folded);
18739                     if (folds_to_this_cp_count > 0) {
18740                         fold_list = add_cp_to_invlist(fold_list, first_fold);
18741                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
18742                             fold_list = add_cp_to_invlist(fold_list,
18743                                                         remaining_folds[i]);
18744                         }
18745                     }
18746
18747                     /* If the fold list is identical to what's in this ANYOF
18748                      * node, the node can be represented by an EXACTFish one
18749                      * instead */
18750                     if (_invlistEQ(*use_this_list, fold_list,
18751                                    0 /* Don't complement */ )
18752                     ) {
18753
18754                         /* But, we have to be careful, as mentioned above.
18755                          * Just the right sequence of characters could match
18756                          * this if it is part of a multi-character fold.  That
18757                          * IS what we want if we are under /i.  But it ISN'T
18758                          * what we want if not under /i, as it could match when
18759                          * it shouldn't.  So, when we aren't under /i and this
18760                          * character participates in a multi-char fold, we
18761                          * don't optimize into an EXACTFish node.  So, for each
18762                          * case below we have to check if we are folding
18763                          * and if not, if it is not part of a multi-char fold.
18764                          * */
18765                         if (start[0] > 255) {    /* Highish code point */
18766                             if (FOLD || ! _invlist_contains_cp(
18767                                             PL_InMultiCharFold, folded))
18768                             {
18769                                 op = (LOC)
18770                                      ? EXACTFLU8
18771                                      : (ASCII_FOLD_RESTRICTED)
18772                                        ? EXACTFAA
18773                                        : EXACTFU_ONLY8;
18774                                 value = folded;
18775                             }
18776                         }   /* Below, the lowest code point < 256 */
18777                         else if (    FOLD
18778                                  &&  folded == 's'
18779                                  &&  DEPENDS_SEMANTICS)
18780                         {   /* An EXACTF node containing a single character
18781                                 's', can be an EXACTFU if it doesn't get
18782                                 joined with an adjacent 's' */
18783                             op = EXACTFU_S_EDGE;
18784                             value = folded;
18785                         }
18786                         else if (    FOLD
18787                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
18788                         {
18789                             if (upper_latin1_only_utf8_matches) {
18790                                 op = EXACTF;
18791
18792                                 /* We can't use the fold, as that only matches
18793                                  * under UTF-8 */
18794                                 value = start[0];
18795                             }
18796                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
18797                                      && ! UTF)
18798                             {   /* EXACTFUP is a special node for this
18799                                    character */
18800                                 op = (ASCII_FOLD_RESTRICTED)
18801                                      ? EXACTFAA
18802                                      : EXACTFUP;
18803                                 value = MICRO_SIGN;
18804                             }
18805                             else if (     ASCII_FOLD_RESTRICTED
18806                                      && ! isASCII(start[0]))
18807                             {   /* For ASCII under /iaa, we can use EXACTFU
18808                                    below */
18809                                 op = EXACTFAA;
18810                                 value = folded;
18811                             }
18812                             else {
18813                                 op = EXACTFU;
18814                                 value = folded;
18815                             }
18816                         }
18817                     }
18818
18819                     SvREFCNT_dec_NN(fold_list);
18820                     SvREFCNT_dec(all_cp_list);
18821                 }
18822             }
18823
18824             if (op != END) {
18825
18826                 /* Here, we have calculated what EXACTish node we would use.
18827                  * But we don't use it if it would require converting the
18828                  * pattern to UTF-8, unless not using it could cause us to miss
18829                  * some folds (hence be buggy) */
18830
18831                 if (! UTF && value > 255) {
18832                     SV * in_multis = NULL;
18833
18834                     assert(FOLD);
18835
18836                     /* If there is no code point that is part of a multi-char
18837                      * fold, then there aren't any matches, so we don't do this
18838                      * optimization.  Otherwise, it could match depending on
18839                      * the context around us, so we do upgrade */
18840                     _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
18841                     if (UNLIKELY(_invlist_len(in_multis) != 0)) {
18842                         REQUIRE_UTF8(flagp);
18843                     }
18844                     else {
18845                         op = END;
18846                     }
18847                 }
18848
18849                 if (op != END) {
18850                     U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
18851
18852                     ret = regnode_guts(pRExC_state, op, len, "exact");
18853                     FILL_NODE(ret, op);
18854                     RExC_emit += 1 + STR_SZ(len);
18855                     STR_LEN(REGNODE_p(ret)) = len;
18856                     if (len == 1) {
18857                         *STRING(REGNODE_p(ret)) = (U8) value;
18858                     }
18859                     else {
18860                         uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
18861                     }
18862                     goto not_anyof;
18863                 }
18864             }
18865         }
18866
18867         if (! has_runtime_dependency) {
18868
18869             /* See if this can be turned into an ANYOFM node.  Think about the
18870              * bit patterns in two different bytes.  In some positions, the
18871              * bits in each will be 1; and in other positions both will be 0;
18872              * and in some positions the bit will be 1 in one byte, and 0 in
18873              * the other.  Let 'n' be the number of positions where the bits
18874              * differ.  We create a mask which has exactly 'n' 0 bits, each in
18875              * a position where the two bytes differ.  Now take the set of all
18876              * bytes that when ANDed with the mask yield the same result.  That
18877              * set has 2**n elements, and is representable by just two 8 bit
18878              * numbers: the result and the mask.  Importantly, matching the set
18879              * can be vectorized by creating a word full of the result bytes,
18880              * and a word full of the mask bytes, yielding a significant speed
18881              * up.  Here, see if this node matches such a set.  As a concrete
18882              * example consider [01], and the byte representing '0' which is
18883              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
18884              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
18885              * 0x30.  Any other bytes ANDed yield something else.  So [01],
18886              * which is a common usage, is optimizable into ANYOFM, and can
18887              * benefit from the speed up.  We can only do this on UTF-8
18888              * invariant bytes, because they have the same bit patterns under
18889              * UTF-8 as not. */
18890             PERL_UINT_FAST8_T inverted = 0;
18891 #ifdef EBCDIC
18892             const PERL_UINT_FAST8_T max_permissible = 0xFF;
18893 #else
18894             const PERL_UINT_FAST8_T max_permissible = 0x7F;
18895 #endif
18896             /* If doesn't fit the criteria for ANYOFM, invert and try again.
18897              * If that works we will instead later generate an NANYOFM, and
18898              * invert back when through */
18899             if (invlist_highest(cp_list) > max_permissible) {
18900                 _invlist_invert(cp_list);
18901                 inverted = 1;
18902             }
18903
18904             if (invlist_highest(cp_list) <= max_permissible) {
18905                 UV this_start, this_end;
18906                 UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
18907                 U8 bits_differing = 0;
18908                 Size_t full_cp_count = 0;
18909                 bool first_time = TRUE;
18910
18911                 /* Go through the bytes and find the bit positions that differ
18912                  * */
18913                 invlist_iterinit(cp_list);
18914                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
18915                     unsigned int i = this_start;
18916
18917                     if (first_time) {
18918                         if (! UVCHR_IS_INVARIANT(i)) {
18919                             goto done_anyofm;
18920                         }
18921
18922                         first_time = FALSE;
18923                         lowest_cp = this_start;
18924
18925                         /* We have set up the code point to compare with.
18926                          * Don't compare it with itself */
18927                         i++;
18928                     }
18929
18930                     /* Find the bit positions that differ from the lowest code
18931                      * point in the node.  Keep track of all such positions by
18932                      * OR'ing */
18933                     for (; i <= this_end; i++) {
18934                         if (! UVCHR_IS_INVARIANT(i)) {
18935                             goto done_anyofm;
18936                         }
18937
18938                         bits_differing  |= i ^ lowest_cp;
18939                     }
18940
18941                     full_cp_count += this_end - this_start + 1;
18942                 }
18943
18944                 /* At the end of the loop, we count how many bits differ from
18945                  * the bits in lowest code point, call the count 'd'.  If the
18946                  * set we found contains 2**d elements, it is the closure of
18947                  * all code points that differ only in those bit positions.  To
18948                  * convince yourself of that, first note that the number in the
18949                  * closure must be a power of 2, which we test for.  The only
18950                  * way we could have that count and it be some differing set,
18951                  * is if we got some code points that don't differ from the
18952                  * lowest code point in any position, but do differ from each
18953                  * other in some other position.  That means one code point has
18954                  * a 1 in that position, and another has a 0.  But that would
18955                  * mean that one of them differs from the lowest code point in
18956                  * that position, which possibility we've already excluded.  */
18957                 if (  (inverted || full_cp_count > 1)
18958                     && full_cp_count == 1U << PL_bitcount[bits_differing])
18959                 {
18960                     U8 ANYOFM_mask;
18961
18962                     op = ANYOFM + inverted;;
18963
18964                     /* We need to make the bits that differ be 0's */
18965                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
18966
18967                     /* The argument is the lowest code point */
18968                     ret = reganode(pRExC_state, op, lowest_cp);
18969                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18970                 }
18971
18972               done_anyofm:
18973                 invlist_iterfinish(cp_list);
18974             }
18975
18976             if (inverted) {
18977                 _invlist_invert(cp_list);
18978             }
18979
18980             if (op != END) {
18981                 goto not_anyof;
18982             }
18983         }
18984
18985         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
18986             PERL_UINT_FAST8_T type;
18987             SV * intersection = NULL;
18988             SV* d_invlist = NULL;
18989
18990             /* See if this matches any of the POSIX classes.  The POSIXA and
18991              * POSIXD ones are about the same speed as ANYOF ops, but take less
18992              * room; the ones that have above-Latin1 code point matches are
18993              * somewhat faster than ANYOF.  */
18994
18995             for (type = POSIXA; type >= POSIXD; type--) {
18996                 int posix_class;
18997
18998                 if (type == POSIXL) {   /* But not /l posix classes */
18999                     continue;
19000                 }
19001
19002                 for (posix_class = 0;
19003                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19004                      posix_class++)
19005                 {
19006                     SV** our_code_points = &cp_list;
19007                     SV** official_code_points;
19008                     int try_inverted;
19009
19010                     if (type == POSIXA) {
19011                         official_code_points = &PL_Posix_ptrs[posix_class];
19012                     }
19013                     else {
19014                         official_code_points = &PL_XPosix_ptrs[posix_class];
19015                     }
19016
19017                     /* Skip non-existent classes of this type.  e.g. \v only
19018                      * has an entry in PL_XPosix_ptrs */
19019                     if (! *official_code_points) {
19020                         continue;
19021                     }
19022
19023                     /* Try both the regular class, and its inversion */
19024                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19025                         bool this_inverted = invert ^ try_inverted;
19026
19027                         if (type != POSIXD) {
19028
19029                             /* This class that isn't /d can't match if we have
19030                              * /d dependencies */
19031                             if (has_runtime_dependency
19032                                                     & HAS_D_RUNTIME_DEPENDENCY)
19033                             {
19034                                 continue;
19035                             }
19036                         }
19037                         else /* is /d */ if (! this_inverted) {
19038
19039                             /* /d classes don't match anything non-ASCII below
19040                              * 256 unconditionally (which cp_list contains) */
19041                             _invlist_intersection(cp_list, PL_UpperLatin1,
19042                                                            &intersection);
19043                             if (_invlist_len(intersection) != 0) {
19044                                 continue;
19045                             }
19046
19047                             SvREFCNT_dec(d_invlist);
19048                             d_invlist = invlist_clone(cp_list, NULL);
19049
19050                             /* But under UTF-8 it turns into using /u rules.
19051                              * Add the things it matches under these conditions
19052                              * so that we check below that these are identical
19053                              * to what the tested class should match */
19054                             if (upper_latin1_only_utf8_matches) {
19055                                 _invlist_union(
19056                                             d_invlist,
19057                                             upper_latin1_only_utf8_matches,
19058                                             &d_invlist);
19059                             }
19060                             our_code_points = &d_invlist;
19061                         }
19062                         else {  /* POSIXD, inverted.  If this doesn't have this
19063                                    flag set, it isn't /d. */
19064                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19065                             {
19066                                 continue;
19067                             }
19068                             our_code_points = &cp_list;
19069                         }
19070
19071                         /* Here, have weeded out some things.  We want to see
19072                          * if the list of characters this node contains
19073                          * ('*our_code_points') precisely matches those of the
19074                          * class we are currently checking against
19075                          * ('*official_code_points'). */
19076                         if (_invlistEQ(*our_code_points,
19077                                        *official_code_points,
19078                                        try_inverted))
19079                         {
19080                             /* Here, they precisely match.  Optimize this ANYOF
19081                              * node into its equivalent POSIX one of the
19082                              * correct type, possibly inverted */
19083                             ret = reg_node(pRExC_state, (try_inverted)
19084                                                         ? type + NPOSIXA
19085                                                                 - POSIXA
19086                                                         : type);
19087                             FLAGS(REGNODE_p(ret)) = posix_class;
19088                             SvREFCNT_dec(d_invlist);
19089                             SvREFCNT_dec(intersection);
19090                             goto not_anyof;
19091                         }
19092                     }
19093                 }
19094             }
19095             SvREFCNT_dec(d_invlist);
19096             SvREFCNT_dec(intersection);
19097         }
19098
19099         /* If didn't find an optimization and there is no need for a bitmap,
19100          * optimize to indicate that */
19101         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19102             && ! LOC
19103             && ! upper_latin1_only_utf8_matches
19104             &&   anyof_flags == 0)
19105         {
19106             U8 low_utf8[UTF8_MAXBYTES+1];
19107             UV highest_cp = invlist_highest(cp_list);
19108
19109             op = ANYOFH;
19110
19111             /* Currently the maximum allowed code point by the system is
19112              * IV_MAX.  Higher ones are reserved for future internal use.  This
19113              * particular regnode can be used for higher ones, but we can't
19114              * calculate the code point of those.  IV_MAX suffices though, as
19115              * it will be a large first byte */
19116             (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
19117
19118             /* We store the lowest possible first byte of the UTF-8
19119              * representation, using the flags field.  This allows for quick
19120              * ruling out of some inputs without having to convert from UTF-8
19121              * to code point.  For EBCDIC, this has to be I8. */
19122             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19123
19124             /* If the first UTF-8 start byte for the highest code point in the
19125              * range is suitably small, we may be able to get an upper bound as
19126              * well */
19127             if (highest_cp <= IV_MAX) {
19128                 U8 high_utf8[UTF8_MAXBYTES+1];
19129
19130                 (void) uvchr_to_utf8(high_utf8, highest_cp);
19131
19132                 /* If the lowest and highest are the same, we can get an exact
19133                  * first byte instead of a just minimum.  We signal this with a
19134                  * different regnode */
19135                 if (low_utf8[0] == high_utf8[0]) {
19136
19137                     /* No need to convert to I8 for EBCDIC as this is an exact
19138                      * match */
19139                     anyof_flags = low_utf8[0];
19140                     op = ANYOFHb;
19141                 }
19142                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19143                 {
19144
19145                     /* Here, the high byte is not the same as the low, but is
19146                      * small enough that its reasonable to have a loose upper
19147                      * bound, which is packed in with the strict lower bound.
19148                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19149                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19150                      * is the same thing as UTF-8 */
19151
19152                     U8 bits = 0;
19153                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19154                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19155                                   - anyof_flags;
19156
19157                     if (range_diff <= max_range_diff / 8) {
19158                         bits = 3;
19159                     }
19160                     else if (range_diff <= max_range_diff / 4) {
19161                         bits = 2;
19162                     }
19163                     else if (range_diff <= max_range_diff / 2) {
19164                         bits = 1;
19165                     }
19166                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19167                     op = ANYOFHr;
19168                 }
19169             }
19170
19171             goto done_finding_op;
19172         }
19173     }   /* End of seeing if can optimize it into a different node */
19174
19175   is_anyof: /* It's going to be an ANYOF node. */
19176     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19177          ? ANYOFD
19178          : ((posixl)
19179             ? ANYOFPOSIXL
19180             : ((LOC)
19181                ? ANYOFL
19182                : ANYOF));
19183
19184   done_finding_op:
19185
19186     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19187     FILL_NODE(ret, op);        /* We set the argument later */
19188     RExC_emit += 1 + regarglen[op];
19189     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19190
19191     /* Here, <cp_list> contains all the code points we can determine at
19192      * compile time that match under all conditions.  Go through it, and
19193      * for things that belong in the bitmap, put them there, and delete from
19194      * <cp_list>.  While we are at it, see if everything above 255 is in the
19195      * list, and if so, set a flag to speed up execution */
19196
19197     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19198
19199     if (posixl) {
19200         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19201     }
19202
19203     if (invert) {
19204         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19205     }
19206
19207     /* Here, the bitmap has been populated with all the Latin1 code points that
19208      * always match.  Can now add to the overall list those that match only
19209      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19210      * */
19211     if (upper_latin1_only_utf8_matches) {
19212         if (cp_list) {
19213             _invlist_union(cp_list,
19214                            upper_latin1_only_utf8_matches,
19215                            &cp_list);
19216             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19217         }
19218         else {
19219             cp_list = upper_latin1_only_utf8_matches;
19220         }
19221         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19222     }
19223
19224     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19225                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19226                    ? listsv : NULL,
19227                   only_utf8_locale_list);
19228     return ret;
19229
19230   not_anyof:
19231
19232     /* Here, the node is getting optimized into something that's not an ANYOF
19233      * one.  Finish up. */
19234
19235     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19236                                            RExC_parse - orig_parse);;
19237     SvREFCNT_dec(cp_list);;
19238     return ret;
19239 }
19240
19241 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19242
19243 STATIC void
19244 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19245                 regnode* const node,
19246                 SV* const cp_list,
19247                 SV* const runtime_defns,
19248                 SV* const only_utf8_locale_list)
19249 {
19250     /* Sets the arg field of an ANYOF-type node 'node', using information about
19251      * the node passed-in.  If there is nothing outside the node's bitmap, the
19252      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19253      * the count returned by add_data(), having allocated and stored an array,
19254      * av, as follows:
19255      *
19256      *  av[0] stores the inversion list defining this class as far as known at
19257      *        this time, or PL_sv_undef if nothing definite is now known.
19258      *  av[1] stores the inversion list of code points that match only if the
19259      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
19260      *        av[2], or no entry otherwise.
19261      *  av[2] stores the list of user-defined properties whose subroutine
19262      *        definitions aren't known at this time, or no entry if none. */
19263
19264     UV n;
19265
19266     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19267
19268     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19269         assert(! (ANYOF_FLAGS(node)
19270                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19271         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19272     }
19273     else {
19274         AV * const av = newAV();
19275         SV *rv;
19276
19277         if (cp_list) {
19278             av_store(av, INVLIST_INDEX, cp_list);
19279         }
19280
19281         if (only_utf8_locale_list) {
19282             av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
19283         }
19284
19285         if (runtime_defns) {
19286             av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
19287         }
19288
19289         rv = newRV_noinc(MUTABLE_SV(av));
19290         n = add_data(pRExC_state, STR_WITH_LEN("s"));
19291         RExC_rxi->data->data[n] = (void*)rv;
19292         ARG_SET(node, n);
19293     }
19294 }
19295
19296 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19297 SV *
19298 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19299                                         const regnode* node,
19300                                         bool doinit,
19301                                         SV** listsvp,
19302                                         SV** only_utf8_locale_ptr,
19303                                         SV** output_invlist)
19304
19305 {
19306     /* For internal core use only.
19307      * Returns the inversion list for the input 'node' in the regex 'prog'.
19308      * If <doinit> is 'true', will attempt to create the inversion list if not
19309      *    already done.
19310      * If <listsvp> is non-null, will return the printable contents of the
19311      *    property definition.  This can be used to get debugging information
19312      *    even before the inversion list exists, by calling this function with
19313      *    'doinit' set to false, in which case the components that will be used
19314      *    to eventually create the inversion list are returned  (in a printable
19315      *    form).
19316      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19317      *    store an inversion list of code points that should match only if the
19318      *    execution-time locale is a UTF-8 one.
19319      * If <output_invlist> is not NULL, it is where this routine is to store an
19320      *    inversion list of the code points that would be instead returned in
19321      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19322      *    when this parameter is used, is just the non-code point data that
19323      *    will go into creating the inversion list.  This currently should be just
19324      *    user-defined properties whose definitions were not known at compile
19325      *    time.  Using this parameter allows for easier manipulation of the
19326      *    inversion list's data by the caller.  It is illegal to call this
19327      *    function with this parameter set, but not <listsvp>
19328      *
19329      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19330      * that, in spite of this function's name, the inversion list it returns
19331      * may include the bitmap data as well */
19332
19333     SV *si  = NULL;         /* Input initialization string */
19334     SV* invlist = NULL;
19335
19336     RXi_GET_DECL(prog, progi);
19337     const struct reg_data * const data = prog ? progi->data : NULL;
19338
19339     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19340     assert(! output_invlist || listsvp);
19341
19342     if (data && data->count) {
19343         const U32 n = ARG(node);
19344
19345         if (data->what[n] == 's') {
19346             SV * const rv = MUTABLE_SV(data->data[n]);
19347             AV * const av = MUTABLE_AV(SvRV(rv));
19348             SV **const ary = AvARRAY(av);
19349
19350             invlist = ary[INVLIST_INDEX];
19351
19352             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19353                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19354             }
19355
19356             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19357                 si = ary[DEFERRED_USER_DEFINED_INDEX];
19358             }
19359
19360             if (doinit && (si || invlist)) {
19361                 if (si) {
19362                     bool user_defined;
19363                     SV * msg = newSVpvs_flags("", SVs_TEMP);
19364
19365                     SV * prop_definition = handle_user_defined_property(
19366                             "", 0, FALSE,   /* There is no \p{}, \P{} */
19367                             SvPVX_const(si)[1] - '0',   /* /i or not has been
19368                                                            stored here for just
19369                                                            this occasion */
19370                             TRUE,           /* run time */
19371                             FALSE,          /* This call must find the defn */
19372                             si,             /* The property definition  */
19373                             &user_defined,
19374                             msg,
19375                             0               /* base level call */
19376                            );
19377
19378                     if (SvCUR(msg)) {
19379                         assert(prop_definition == NULL);
19380
19381                         Perl_croak(aTHX_ "%" UTF8f,
19382                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19383                     }
19384
19385                     if (invlist) {
19386                         _invlist_union(invlist, prop_definition, &invlist);
19387                         SvREFCNT_dec_NN(prop_definition);
19388                     }
19389                     else {
19390                         invlist = prop_definition;
19391                     }
19392
19393                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19394                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19395
19396                     av_store(av, INVLIST_INDEX, invlist);
19397                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19398                                  ? ONLY_LOCALE_MATCHES_INDEX:
19399                                  INVLIST_INDEX);
19400                     si = NULL;
19401                 }
19402             }
19403         }
19404     }
19405
19406     /* If requested, return a printable version of what this ANYOF node matches
19407      * */
19408     if (listsvp) {
19409         SV* matches_string = NULL;
19410
19411         /* This function can be called at compile-time, before everything gets
19412          * resolved, in which case we return the currently best available
19413          * information, which is the string that will eventually be used to do
19414          * that resolving, 'si' */
19415         if (si) {
19416             /* Here, we only have 'si' (and possibly some passed-in data in
19417              * 'invlist', which is handled below)  If the caller only wants
19418              * 'si', use that.  */
19419             if (! output_invlist) {
19420                 matches_string = newSVsv(si);
19421             }
19422             else {
19423                 /* But if the caller wants an inversion list of the node, we
19424                  * need to parse 'si' and place as much as possible in the
19425                  * desired output inversion list, making 'matches_string' only
19426                  * contain the currently unresolvable things */
19427                 const char *si_string = SvPVX(si);
19428                 STRLEN remaining = SvCUR(si);
19429                 UV prev_cp = 0;
19430                 U8 count = 0;
19431
19432                 /* Ignore everything before the first new-line */
19433                 while (*si_string != '\n' && remaining > 0) {
19434                     si_string++;
19435                     remaining--;
19436                 }
19437                 assert(remaining > 0);
19438
19439                 si_string++;
19440                 remaining--;
19441
19442                 while (remaining > 0) {
19443
19444                     /* The data consists of just strings defining user-defined
19445                      * property names, but in prior incarnations, and perhaps
19446                      * somehow from pluggable regex engines, it could still
19447                      * hold hex code point definitions.  Each component of a
19448                      * range would be separated by a tab, and each range by a
19449                      * new-line.  If these are found, instead add them to the
19450                      * inversion list */
19451                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19452                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19453                     STRLEN len = remaining;
19454                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19455
19456                     /* If the hex decode routine found something, it should go
19457                      * up to the next \n */
19458                     if (   *(si_string + len) == '\n') {
19459                         if (count) {    /* 2nd code point on line */
19460                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19461                         }
19462                         else {
19463                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19464                         }
19465                         count = 0;
19466                         goto prepare_for_next_iteration;
19467                     }
19468
19469                     /* If the hex decode was instead for the lower range limit,
19470                      * save it, and go parse the upper range limit */
19471                     if (*(si_string + len) == '\t') {
19472                         assert(count == 0);
19473
19474                         prev_cp = cp;
19475                         count = 1;
19476                       prepare_for_next_iteration:
19477                         si_string += len + 1;
19478                         remaining -= len + 1;
19479                         continue;
19480                     }
19481
19482                     /* Here, didn't find a legal hex number.  Just add it from
19483                      * here to the next \n */
19484
19485                     remaining -= len;
19486                     while (*(si_string + len) != '\n' && remaining > 0) {
19487                         remaining--;
19488                         len++;
19489                     }
19490                     if (*(si_string + len) == '\n') {
19491                         len++;
19492                         remaining--;
19493                     }
19494                     if (matches_string) {
19495                         sv_catpvn(matches_string, si_string, len - 1);
19496                     }
19497                     else {
19498                         matches_string = newSVpvn(si_string, len - 1);
19499                     }
19500                     si_string += len;
19501                     sv_catpvs(matches_string, " ");
19502                 } /* end of loop through the text */
19503
19504                 assert(matches_string);
19505                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19506                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19507                 }
19508             } /* end of has an 'si' */
19509         }
19510
19511         /* Add the stuff that's already known */
19512         if (invlist) {
19513
19514             /* Again, if the caller doesn't want the output inversion list, put
19515              * everything in 'matches-string' */
19516             if (! output_invlist) {
19517                 if ( ! matches_string) {
19518                     matches_string = newSVpvs("\n");
19519                 }
19520                 sv_catsv(matches_string, invlist_contents(invlist,
19521                                                   TRUE /* traditional style */
19522                                                   ));
19523             }
19524             else if (! *output_invlist) {
19525                 *output_invlist = invlist_clone(invlist, NULL);
19526             }
19527             else {
19528                 _invlist_union(*output_invlist, invlist, output_invlist);
19529             }
19530         }
19531
19532         *listsvp = matches_string;
19533     }
19534
19535     return invlist;
19536 }
19537 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19538
19539 /* reg_skipcomment()
19540
19541    Absorbs an /x style # comment from the input stream,
19542    returning a pointer to the first character beyond the comment, or if the
19543    comment terminates the pattern without anything following it, this returns
19544    one past the final character of the pattern (in other words, RExC_end) and
19545    sets the REG_RUN_ON_COMMENT_SEEN flag.
19546
19547    Note it's the callers responsibility to ensure that we are
19548    actually in /x mode
19549
19550 */
19551
19552 PERL_STATIC_INLINE char*
19553 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19554 {
19555     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19556
19557     assert(*p == '#');
19558
19559     while (p < RExC_end) {
19560         if (*(++p) == '\n') {
19561             return p+1;
19562         }
19563     }
19564
19565     /* we ran off the end of the pattern without ending the comment, so we have
19566      * to add an \n when wrapping */
19567     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19568     return p;
19569 }
19570
19571 STATIC void
19572 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19573                                 char ** p,
19574                                 const bool force_to_xmod
19575                          )
19576 {
19577     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19578      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19579      * is /x whitespace, advance '*p' so that on exit it points to the first
19580      * byte past all such white space and comments */
19581
19582     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19583
19584     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19585
19586     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19587
19588     for (;;) {
19589         if (RExC_end - (*p) >= 3
19590             && *(*p)     == '('
19591             && *(*p + 1) == '?'
19592             && *(*p + 2) == '#')
19593         {
19594             while (*(*p) != ')') {
19595                 if ((*p) == RExC_end)
19596                     FAIL("Sequence (?#... not terminated");
19597                 (*p)++;
19598             }
19599             (*p)++;
19600             continue;
19601         }
19602
19603         if (use_xmod) {
19604             const char * save_p = *p;
19605             while ((*p) < RExC_end) {
19606                 STRLEN len;
19607                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19608                     (*p) += len;
19609                 }
19610                 else if (*(*p) == '#') {
19611                     (*p) = reg_skipcomment(pRExC_state, (*p));
19612                 }
19613                 else {
19614                     break;
19615                 }
19616             }
19617             if (*p != save_p) {
19618                 continue;
19619             }
19620         }
19621
19622         break;
19623     }
19624
19625     return;
19626 }
19627
19628 /* nextchar()
19629
19630    Advances the parse position by one byte, unless that byte is the beginning
19631    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
19632    those two cases, the parse position is advanced beyond all such comments and
19633    white space.
19634
19635    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19636 */
19637
19638 STATIC void
19639 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19640 {
19641     PERL_ARGS_ASSERT_NEXTCHAR;
19642
19643     if (RExC_parse < RExC_end) {
19644         assert(   ! UTF
19645                || UTF8_IS_INVARIANT(*RExC_parse)
19646                || UTF8_IS_START(*RExC_parse));
19647
19648         RExC_parse += (UTF)
19649                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
19650                       : 1;
19651
19652         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19653                                 FALSE /* Don't force /x */ );
19654     }
19655 }
19656
19657 STATIC void
19658 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19659 {
19660     /* 'size' is the delta to add or subtract from the current memory allocated
19661      * to the regex engine being constructed */
19662
19663     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19664
19665     RExC_size += size;
19666
19667     Renewc(RExC_rxi,
19668            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
19669                                                 /* +1 for REG_MAGIC */
19670            char,
19671            regexp_internal);
19672     if ( RExC_rxi == NULL )
19673         FAIL("Regexp out of space");
19674     RXi_SET(RExC_rx, RExC_rxi);
19675
19676     RExC_emit_start = RExC_rxi->program;
19677     if (size > 0) {
19678         Zero(REGNODE_p(RExC_emit), size, regnode);
19679     }
19680
19681 #ifdef RE_TRACK_PATTERN_OFFSETS
19682     Renew(RExC_offsets, 2*RExC_size+1, U32);
19683     if (size > 0) {
19684         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
19685     }
19686     RExC_offsets[0] = RExC_size;
19687 #endif
19688 }
19689
19690 STATIC regnode_offset
19691 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19692 {
19693     /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
19694      * and increments RExC_size and RExC_emit
19695      *
19696      * It returns the regnode's offset into the regex engine program */
19697
19698     const regnode_offset ret = RExC_emit;
19699
19700     GET_RE_DEBUG_FLAGS_DECL;
19701
19702     PERL_ARGS_ASSERT_REGNODE_GUTS;
19703
19704     SIZE_ALIGN(RExC_size);
19705     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19706     NODE_ALIGN_FILL(REGNODE_p(ret));
19707 #ifndef RE_TRACK_PATTERN_OFFSETS
19708     PERL_UNUSED_ARG(name);
19709     PERL_UNUSED_ARG(op);
19710 #else
19711     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19712
19713     if (RExC_offsets) {         /* MJD */
19714         MJD_OFFSET_DEBUG(
19715               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19716               name, __LINE__,
19717               PL_reg_name[op],
19718               (UV)(RExC_emit) > RExC_offsets[0]
19719                 ? "Overwriting end of array!\n" : "OK",
19720               (UV)(RExC_emit),
19721               (UV)(RExC_parse - RExC_start),
19722               (UV)RExC_offsets[0]));
19723         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19724     }
19725 #endif
19726     return(ret);
19727 }
19728
19729 /*
19730 - reg_node - emit a node
19731 */
19732 STATIC regnode_offset /* Location. */
19733 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19734 {
19735     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19736     regnode_offset ptr = ret;
19737
19738     PERL_ARGS_ASSERT_REG_NODE;
19739
19740     assert(regarglen[op] == 0);
19741
19742     FILL_ADVANCE_NODE(ptr, op);
19743     RExC_emit = ptr;
19744     return(ret);
19745 }
19746
19747 /*
19748 - reganode - emit a node with an argument
19749 */
19750 STATIC regnode_offset /* Location. */
19751 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19752 {
19753     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19754     regnode_offset ptr = ret;
19755
19756     PERL_ARGS_ASSERT_REGANODE;
19757
19758     /* ANYOF are special cased to allow non-length 1 args */
19759     assert(regarglen[op] == 1);
19760
19761     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19762     RExC_emit = ptr;
19763     return(ret);
19764 }
19765
19766 STATIC regnode_offset
19767 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19768 {
19769     /* emit a node with U32 and I32 arguments */
19770
19771     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19772     regnode_offset ptr = ret;
19773
19774     PERL_ARGS_ASSERT_REG2LANODE;
19775
19776     assert(regarglen[op] == 2);
19777
19778     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19779     RExC_emit = ptr;
19780     return(ret);
19781 }
19782
19783 /*
19784 - reginsert - insert an operator in front of already-emitted operand
19785 *
19786 * That means that on exit 'operand' is the offset of the newly inserted
19787 * operator, and the original operand has been relocated.
19788 *
19789 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19790 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19791 *
19792 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19793 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19794 *
19795 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19796 */
19797 STATIC void
19798 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19799                   const regnode_offset operand, const U32 depth)
19800 {
19801     regnode *src;
19802     regnode *dst;
19803     regnode *place;
19804     const int offset = regarglen[(U8)op];
19805     const int size = NODE_STEP_REGNODE + offset;
19806     GET_RE_DEBUG_FLAGS_DECL;
19807
19808     PERL_ARGS_ASSERT_REGINSERT;
19809     PERL_UNUSED_CONTEXT;
19810     PERL_UNUSED_ARG(depth);
19811 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19812     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19813     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19814                                     studying. If this is wrong then we need to adjust RExC_recurse
19815                                     below like we do with RExC_open_parens/RExC_close_parens. */
19816     change_engine_size(pRExC_state, (Ptrdiff_t) size);
19817     src = REGNODE_p(RExC_emit);
19818     RExC_emit += size;
19819     dst = REGNODE_p(RExC_emit);
19820
19821     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
19822      * and [perl #133871] shows this can lead to problems, so skip this
19823      * realignment of parens until a later pass when they are reliable */
19824     if (! IN_PARENS_PASS && RExC_open_parens) {
19825         int paren;
19826         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19827         /* remember that RExC_npar is rex->nparens + 1,
19828          * iow it is 1 more than the number of parens seen in
19829          * the pattern so far. */
19830         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19831             /* note, RExC_open_parens[0] is the start of the
19832              * regex, it can't move. RExC_close_parens[0] is the end
19833              * of the regex, it *can* move. */
19834             if ( paren && RExC_open_parens[paren] >= operand ) {
19835                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19836                 RExC_open_parens[paren] += size;
19837             } else {
19838                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19839             }
19840             if ( RExC_close_parens[paren] >= operand ) {
19841                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19842                 RExC_close_parens[paren] += size;
19843             } else {
19844                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19845             }
19846         }
19847     }
19848     if (RExC_end_op)
19849         RExC_end_op += size;
19850
19851     while (src > REGNODE_p(operand)) {
19852         StructCopy(--src, --dst, regnode);
19853 #ifdef RE_TRACK_PATTERN_OFFSETS
19854         if (RExC_offsets) {     /* MJD 20010112 */
19855             MJD_OFFSET_DEBUG(
19856                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19857                   "reginsert",
19858                   __LINE__,
19859                   PL_reg_name[op],
19860                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19861                     ? "Overwriting end of array!\n" : "OK",
19862                   (UV)REGNODE_OFFSET(src),
19863                   (UV)REGNODE_OFFSET(dst),
19864                   (UV)RExC_offsets[0]));
19865             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19866             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19867         }
19868 #endif
19869     }
19870
19871     place = REGNODE_p(operand); /* Op node, where operand used to be. */
19872 #ifdef RE_TRACK_PATTERN_OFFSETS
19873     if (RExC_offsets) {         /* MJD */
19874         MJD_OFFSET_DEBUG(
19875               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19876               "reginsert",
19877               __LINE__,
19878               PL_reg_name[op],
19879               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19880               ? "Overwriting end of array!\n" : "OK",
19881               (UV)REGNODE_OFFSET(place),
19882               (UV)(RExC_parse - RExC_start),
19883               (UV)RExC_offsets[0]));
19884         Set_Node_Offset(place, RExC_parse);
19885         Set_Node_Length(place, 1);
19886     }
19887 #endif
19888     src = NEXTOPER(place);
19889     FLAGS(place) = 0;
19890     FILL_NODE(operand, op);
19891
19892     /* Zero out any arguments in the new node */
19893     Zero(src, offset, regnode);
19894 }
19895
19896 /*
19897 - regtail - set the next-pointer at the end of a node chain of p to val.  If
19898             that value won't fit in the space available, instead returns FALSE.
19899             (Except asserts if we can't fit in the largest space the regex
19900             engine is designed for.)
19901 - SEE ALSO: regtail_study
19902 */
19903 STATIC bool
19904 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19905                 const regnode_offset p,
19906                 const regnode_offset val,
19907                 const U32 depth)
19908 {
19909     regnode_offset scan;
19910     GET_RE_DEBUG_FLAGS_DECL;
19911
19912     PERL_ARGS_ASSERT_REGTAIL;
19913 #ifndef DEBUGGING
19914     PERL_UNUSED_ARG(depth);
19915 #endif
19916
19917     /* Find last node. */
19918     scan = (regnode_offset) p;
19919     for (;;) {
19920         regnode * const temp = regnext(REGNODE_p(scan));
19921         DEBUG_PARSE_r({
19922             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19923             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19924             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
19925                 SvPV_nolen_const(RExC_mysv), scan,
19926                     (temp == NULL ? "->" : ""),
19927                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19928             );
19929         });
19930         if (temp == NULL)
19931             break;
19932         scan = REGNODE_OFFSET(temp);
19933     }
19934
19935     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19936         assert((UV) (val - scan) <= U32_MAX);
19937         ARG_SET(REGNODE_p(scan), val - scan);
19938     }
19939     else {
19940         if (val - scan > U16_MAX) {
19941             /* Populate this with something that won't loop and will likely
19942              * lead to a crash if the caller ignores the failure return, and
19943              * execution continues */
19944             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
19945             return FALSE;
19946         }
19947         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19948     }
19949
19950     return TRUE;
19951 }
19952
19953 #ifdef DEBUGGING
19954 /*
19955 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19956 - Look for optimizable sequences at the same time.
19957 - currently only looks for EXACT chains.
19958
19959 This is experimental code. The idea is to use this routine to perform
19960 in place optimizations on branches and groups as they are constructed,
19961 with the long term intention of removing optimization from study_chunk so
19962 that it is purely analytical.
19963
19964 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19965 to control which is which.
19966
19967 This used to return a value that was ignored.  It was a problem that it is
19968 #ifdef'd to be another function that didn't return a value.  khw has changed it
19969 so both currently return a pass/fail return.
19970
19971 */
19972 /* TODO: All four parms should be const */
19973
19974 STATIC bool
19975 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19976                       const regnode_offset val, U32 depth)
19977 {
19978     regnode_offset scan;
19979     U8 exact = PSEUDO;
19980 #ifdef EXPERIMENTAL_INPLACESCAN
19981     I32 min = 0;
19982 #endif
19983     GET_RE_DEBUG_FLAGS_DECL;
19984
19985     PERL_ARGS_ASSERT_REGTAIL_STUDY;
19986
19987
19988     /* Find last node. */
19989
19990     scan = p;
19991     for (;;) {
19992         regnode * const temp = regnext(REGNODE_p(scan));
19993 #ifdef EXPERIMENTAL_INPLACESCAN
19994         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19995             bool unfolded_multi_char;   /* Unexamined in this routine */
19996             if (join_exact(pRExC_state, scan, &min,
19997                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19998                 return TRUE; /* Was return EXACT */
19999         }
20000 #endif
20001         if ( exact ) {
20002             switch (OP(REGNODE_p(scan))) {
20003                 case EXACT:
20004                 case EXACT_ONLY8:
20005                 case EXACTL:
20006                 case EXACTF:
20007                 case EXACTFU_S_EDGE:
20008                 case EXACTFAA_NO_TRIE:
20009                 case EXACTFAA:
20010                 case EXACTFU:
20011                 case EXACTFU_ONLY8:
20012                 case EXACTFLU8:
20013                 case EXACTFUP:
20014                 case EXACTFL:
20015                         if( exact == PSEUDO )
20016                             exact= OP(REGNODE_p(scan));
20017                         else if ( exact != OP(REGNODE_p(scan)) )
20018                             exact= 0;
20019                 case NOTHING:
20020                     break;
20021                 default:
20022                     exact= 0;
20023             }
20024         }
20025         DEBUG_PARSE_r({
20026             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20027             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20028             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
20029                 SvPV_nolen_const(RExC_mysv),
20030                 scan,
20031                 PL_reg_name[exact]);
20032         });
20033         if (temp == NULL)
20034             break;
20035         scan = REGNODE_OFFSET(temp);
20036     }
20037     DEBUG_PARSE_r({
20038         DEBUG_PARSE_MSG("");
20039         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20040         Perl_re_printf( aTHX_
20041                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20042                       SvPV_nolen_const(RExC_mysv),
20043                       (IV)val,
20044                       (IV)(val - scan)
20045         );
20046     });
20047     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20048         assert((UV) (val - scan) <= U32_MAX);
20049         ARG_SET(REGNODE_p(scan), val - scan);
20050     }
20051     else {
20052         if (val - scan > U16_MAX) {
20053             /* Populate this with something that won't loop and will likely
20054              * lead to a crash if the caller ignores the failure return, and
20055              * execution continues */
20056             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20057             return FALSE;
20058         }
20059         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20060     }
20061
20062     return TRUE; /* Was 'return exact' */
20063 }
20064 #endif
20065
20066 STATIC SV*
20067 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20068
20069     /* Returns an inversion list of all the code points matched by the
20070      * ANYOFM/NANYOFM node 'n' */
20071
20072     SV * cp_list = _new_invlist(-1);
20073     const U8 lowest = (U8) ARG(n);
20074     unsigned int i;
20075     U8 count = 0;
20076     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20077
20078     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20079
20080     /* Starting with the lowest code point, any code point that ANDed with the
20081      * mask yields the lowest code point is in the set */
20082     for (i = lowest; i <= 0xFF; i++) {
20083         if ((i & FLAGS(n)) == ARG(n)) {
20084             cp_list = add_cp_to_invlist(cp_list, i);
20085             count++;
20086
20087             /* We know how many code points (a power of two) that are in the
20088              * set.  No use looking once we've got that number */
20089             if (count >= needed) break;
20090         }
20091     }
20092
20093     if (OP(n) == NANYOFM) {
20094         _invlist_invert(cp_list);
20095     }
20096     return cp_list;
20097 }
20098
20099 /*
20100  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20101  */
20102 #ifdef DEBUGGING
20103
20104 static void
20105 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20106 {
20107     int bit;
20108     int set=0;
20109
20110     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20111
20112     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20113         if (flags & (1<<bit)) {
20114             if (!set++ && lead)
20115                 Perl_re_printf( aTHX_  "%s", lead);
20116             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20117         }
20118     }
20119     if (lead)  {
20120         if (set)
20121             Perl_re_printf( aTHX_  "\n");
20122         else
20123             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20124     }
20125 }
20126
20127 static void
20128 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20129 {
20130     int bit;
20131     int set=0;
20132     regex_charset cs;
20133
20134     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20135
20136     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20137         if (flags & (1<<bit)) {
20138             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20139                 continue;
20140             }
20141             if (!set++ && lead)
20142                 Perl_re_printf( aTHX_  "%s", lead);
20143             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20144         }
20145     }
20146     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20147             if (!set++ && lead) {
20148                 Perl_re_printf( aTHX_  "%s", lead);
20149             }
20150             switch (cs) {
20151                 case REGEX_UNICODE_CHARSET:
20152                     Perl_re_printf( aTHX_  "UNICODE");
20153                     break;
20154                 case REGEX_LOCALE_CHARSET:
20155                     Perl_re_printf( aTHX_  "LOCALE");
20156                     break;
20157                 case REGEX_ASCII_RESTRICTED_CHARSET:
20158                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20159                     break;
20160                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20161                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20162                     break;
20163                 default:
20164                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20165                     break;
20166             }
20167     }
20168     if (lead)  {
20169         if (set)
20170             Perl_re_printf( aTHX_  "\n");
20171         else
20172             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20173     }
20174 }
20175 #endif
20176
20177 void
20178 Perl_regdump(pTHX_ const regexp *r)
20179 {
20180 #ifdef DEBUGGING
20181     int i;
20182     SV * const sv = sv_newmortal();
20183     SV *dsv= sv_newmortal();
20184     RXi_GET_DECL(r, ri);
20185     GET_RE_DEBUG_FLAGS_DECL;
20186
20187     PERL_ARGS_ASSERT_REGDUMP;
20188
20189     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20190
20191     /* Header fields of interest. */
20192     for (i = 0; i < 2; i++) {
20193         if (r->substrs->data[i].substr) {
20194             RE_PV_QUOTED_DECL(s, 0, dsv,
20195                             SvPVX_const(r->substrs->data[i].substr),
20196                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20197                             PL_dump_re_max_len);
20198             Perl_re_printf( aTHX_
20199                           "%s %s%s at %" IVdf "..%" UVuf " ",
20200                           i ? "floating" : "anchored",
20201                           s,
20202                           RE_SV_TAIL(r->substrs->data[i].substr),
20203                           (IV)r->substrs->data[i].min_offset,
20204                           (UV)r->substrs->data[i].max_offset);
20205         }
20206         else if (r->substrs->data[i].utf8_substr) {
20207             RE_PV_QUOTED_DECL(s, 1, dsv,
20208                             SvPVX_const(r->substrs->data[i].utf8_substr),
20209                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20210                             30);
20211             Perl_re_printf( aTHX_
20212                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20213                           i ? "floating" : "anchored",
20214                           s,
20215                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20216                           (IV)r->substrs->data[i].min_offset,
20217                           (UV)r->substrs->data[i].max_offset);
20218         }
20219     }
20220
20221     if (r->check_substr || r->check_utf8)
20222         Perl_re_printf( aTHX_
20223                       (const char *)
20224                       (   r->check_substr == r->substrs->data[1].substr
20225                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20226                        ? "(checking floating" : "(checking anchored"));
20227     if (r->intflags & PREGf_NOSCAN)
20228         Perl_re_printf( aTHX_  " noscan");
20229     if (r->extflags & RXf_CHECK_ALL)
20230         Perl_re_printf( aTHX_  " isall");
20231     if (r->check_substr || r->check_utf8)
20232         Perl_re_printf( aTHX_  ") ");
20233
20234     if (ri->regstclass) {
20235         regprop(r, sv, ri->regstclass, NULL, NULL);
20236         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20237     }
20238     if (r->intflags & PREGf_ANCH) {
20239         Perl_re_printf( aTHX_  "anchored");
20240         if (r->intflags & PREGf_ANCH_MBOL)
20241             Perl_re_printf( aTHX_  "(MBOL)");
20242         if (r->intflags & PREGf_ANCH_SBOL)
20243             Perl_re_printf( aTHX_  "(SBOL)");
20244         if (r->intflags & PREGf_ANCH_GPOS)
20245             Perl_re_printf( aTHX_  "(GPOS)");
20246         Perl_re_printf( aTHX_ " ");
20247     }
20248     if (r->intflags & PREGf_GPOS_SEEN)
20249         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20250     if (r->intflags & PREGf_SKIP)
20251         Perl_re_printf( aTHX_  "plus ");
20252     if (r->intflags & PREGf_IMPLICIT)
20253         Perl_re_printf( aTHX_  "implicit ");
20254     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20255     if (r->extflags & RXf_EVAL_SEEN)
20256         Perl_re_printf( aTHX_  "with eval ");
20257     Perl_re_printf( aTHX_  "\n");
20258     DEBUG_FLAGS_r({
20259         regdump_extflags("r->extflags: ", r->extflags);
20260         regdump_intflags("r->intflags: ", r->intflags);
20261     });
20262 #else
20263     PERL_ARGS_ASSERT_REGDUMP;
20264     PERL_UNUSED_CONTEXT;
20265     PERL_UNUSED_ARG(r);
20266 #endif  /* DEBUGGING */
20267 }
20268
20269 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20270 #ifdef DEBUGGING
20271
20272 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20273      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20274      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20275      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20276      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20277      || _CC_VERTSPACE != 15
20278 #   error Need to adjust order of anyofs[]
20279 #  endif
20280 static const char * const anyofs[] = {
20281     "\\w",
20282     "\\W",
20283     "\\d",
20284     "\\D",
20285     "[:alpha:]",
20286     "[:^alpha:]",
20287     "[:lower:]",
20288     "[:^lower:]",
20289     "[:upper:]",
20290     "[:^upper:]",
20291     "[:punct:]",
20292     "[:^punct:]",
20293     "[:print:]",
20294     "[:^print:]",
20295     "[:alnum:]",
20296     "[:^alnum:]",
20297     "[:graph:]",
20298     "[:^graph:]",
20299     "[:cased:]",
20300     "[:^cased:]",
20301     "\\s",
20302     "\\S",
20303     "[:blank:]",
20304     "[:^blank:]",
20305     "[:xdigit:]",
20306     "[:^xdigit:]",
20307     "[:cntrl:]",
20308     "[:^cntrl:]",
20309     "[:ascii:]",
20310     "[:^ascii:]",
20311     "\\v",
20312     "\\V"
20313 };
20314 #endif
20315
20316 /*
20317 - regprop - printable representation of opcode, with run time support
20318 */
20319
20320 void
20321 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20322 {
20323 #ifdef DEBUGGING
20324     dVAR;
20325     int k;
20326     RXi_GET_DECL(prog, progi);
20327     GET_RE_DEBUG_FLAGS_DECL;
20328
20329     PERL_ARGS_ASSERT_REGPROP;
20330
20331     SvPVCLEAR(sv);
20332
20333     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
20334         if (pRExC_state) {  /* This gives more info, if we have it */
20335             FAIL3("panic: corrupted regexp opcode %d > %d",
20336                   (int)OP(o), (int)REGNODE_MAX);
20337         }
20338         else {
20339             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
20340                              (int)OP(o), (int)REGNODE_MAX);
20341         }
20342     }
20343     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20344
20345     k = PL_regkind[OP(o)];
20346
20347     if (k == EXACT) {
20348         sv_catpvs(sv, " ");
20349         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20350          * is a crude hack but it may be the best for now since
20351          * we have no flag "this EXACTish node was UTF-8"
20352          * --jhi */
20353         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20354                   PL_colors[0], PL_colors[1],
20355                   PERL_PV_ESCAPE_UNI_DETECT |
20356                   PERL_PV_ESCAPE_NONASCII   |
20357                   PERL_PV_PRETTY_ELLIPSES   |
20358                   PERL_PV_PRETTY_LTGT       |
20359                   PERL_PV_PRETTY_NOCLEAR
20360                   );
20361     } else if (k == TRIE) {
20362         /* print the details of the trie in dumpuntil instead, as
20363          * progi->data isn't available here */
20364         const char op = OP(o);
20365         const U32 n = ARG(o);
20366         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20367                (reg_ac_data *)progi->data->data[n] :
20368                NULL;
20369         const reg_trie_data * const trie
20370             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20371
20372         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20373         DEBUG_TRIE_COMPILE_r({
20374           if (trie->jump)
20375             sv_catpvs(sv, "(JUMP)");
20376           Perl_sv_catpvf(aTHX_ sv,
20377             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20378             (UV)trie->startstate,
20379             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20380             (UV)trie->wordcount,
20381             (UV)trie->minlen,
20382             (UV)trie->maxlen,
20383             (UV)TRIE_CHARCOUNT(trie),
20384             (UV)trie->uniquecharcount
20385           );
20386         });
20387         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20388             sv_catpvs(sv, "[");
20389             (void) put_charclass_bitmap_innards(sv,
20390                                                 ((IS_ANYOF_TRIE(op))
20391                                                  ? ANYOF_BITMAP(o)
20392                                                  : TRIE_BITMAP(trie)),
20393                                                 NULL,
20394                                                 NULL,
20395                                                 NULL,
20396                                                 FALSE
20397                                                );
20398             sv_catpvs(sv, "]");
20399         }
20400     } else if (k == CURLY) {
20401         U32 lo = ARG1(o), hi = ARG2(o);
20402         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20403             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20404         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20405         if (hi == REG_INFTY)
20406             sv_catpvs(sv, "INFTY");
20407         else
20408             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20409         sv_catpvs(sv, "}");
20410     }
20411     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
20412         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20413     else if (k == REF || k == OPEN || k == CLOSE
20414              || k == GROUPP || OP(o)==ACCEPT)
20415     {
20416         AV *name_list= NULL;
20417         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20418         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20419         if ( RXp_PAREN_NAMES(prog) ) {
20420             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20421         } else if ( pRExC_state ) {
20422             name_list= RExC_paren_name_list;
20423         }
20424         if (name_list) {
20425             if ( k != REF || (OP(o) < REFN)) {
20426                 SV **name= av_fetch(name_list, parno, 0 );
20427                 if (name)
20428                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20429             }
20430             else {
20431                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20432                 I32 *nums=(I32*)SvPVX(sv_dat);
20433                 SV **name= av_fetch(name_list, nums[0], 0 );
20434                 I32 n;
20435                 if (name) {
20436                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20437                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20438                                     (n ? "," : ""), (IV)nums[n]);
20439                     }
20440                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20441                 }
20442             }
20443         }
20444         if ( k == REF && reginfo) {
20445             U32 n = ARG(o);  /* which paren pair */
20446             I32 ln = prog->offs[n].start;
20447             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20448                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20449             else if (ln == prog->offs[n].end)
20450                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20451             else {
20452                 const char *s = reginfo->strbeg + ln;
20453                 Perl_sv_catpvf(aTHX_ sv, ": ");
20454                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20455                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20456             }
20457         }
20458     } else if (k == GOSUB) {
20459         AV *name_list= NULL;
20460         if ( RXp_PAREN_NAMES(prog) ) {
20461             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20462         } else if ( pRExC_state ) {
20463             name_list= RExC_paren_name_list;
20464         }
20465
20466         /* Paren and offset */
20467         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20468                 (int)((o + (int)ARG2L(o)) - progi->program) );
20469         if (name_list) {
20470             SV **name= av_fetch(name_list, ARG(o), 0 );
20471             if (name)
20472                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20473         }
20474     }
20475     else if (k == LOGICAL)
20476         /* 2: embedded, otherwise 1 */
20477         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20478     else if (k == ANYOF) {
20479         const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
20480                           ? 0
20481                           : ANYOF_FLAGS(o);
20482         bool do_sep = FALSE;    /* Do we need to separate various components of
20483                                    the output? */
20484         /* Set if there is still an unresolved user-defined property */
20485         SV *unresolved                = NULL;
20486
20487         /* Things that are ignored except when the runtime locale is UTF-8 */
20488         SV *only_utf8_locale_invlist = NULL;
20489
20490         /* Code points that don't fit in the bitmap */
20491         SV *nonbitmap_invlist = NULL;
20492
20493         /* And things that aren't in the bitmap, but are small enough to be */
20494         SV* bitmap_range_not_in_bitmap = NULL;
20495
20496         const bool inverted = flags & ANYOF_INVERT;
20497
20498         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20499             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20500                 sv_catpvs(sv, "{utf8-locale-reqd}");
20501             }
20502             if (flags & ANYOFL_FOLD) {
20503                 sv_catpvs(sv, "{i}");
20504             }
20505         }
20506
20507         /* If there is stuff outside the bitmap, get it */
20508         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20509             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20510                                                 &unresolved,
20511                                                 &only_utf8_locale_invlist,
20512                                                 &nonbitmap_invlist);
20513             /* The non-bitmap data may contain stuff that could fit in the
20514              * bitmap.  This could come from a user-defined property being
20515              * finally resolved when this call was done; or much more likely
20516              * because there are matches that require UTF-8 to be valid, and so
20517              * aren't in the bitmap.  This is teased apart later */
20518             _invlist_intersection(nonbitmap_invlist,
20519                                   PL_InBitmap,
20520                                   &bitmap_range_not_in_bitmap);
20521             /* Leave just the things that don't fit into the bitmap */
20522             _invlist_subtract(nonbitmap_invlist,
20523                               PL_InBitmap,
20524                               &nonbitmap_invlist);
20525         }
20526
20527         /* Obey this flag to add all above-the-bitmap code points */
20528         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20529             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20530                                                       NUM_ANYOF_CODE_POINTS,
20531                                                       UV_MAX);
20532         }
20533
20534         /* Ready to start outputting.  First, the initial left bracket */
20535         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20536
20537         if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
20538             /* Then all the things that could fit in the bitmap */
20539             do_sep = put_charclass_bitmap_innards(sv,
20540                                                   ANYOF_BITMAP(o),
20541                                                   bitmap_range_not_in_bitmap,
20542                                                   only_utf8_locale_invlist,
20543                                                   o,
20544
20545                                                   /* Can't try inverting for a
20546                                                    * better display if there
20547                                                    * are things that haven't
20548                                                    * been resolved */
20549                                                   unresolved != NULL);
20550             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20551
20552             /* If there are user-defined properties which haven't been defined
20553              * yet, output them.  If the result is not to be inverted, it is
20554              * clearest to output them in a separate [] from the bitmap range
20555              * stuff.  If the result is to be complemented, we have to show
20556              * everything in one [], as the inversion applies to the whole
20557              * thing.  Use {braces} to separate them from anything in the
20558              * bitmap and anything above the bitmap. */
20559             if (unresolved) {
20560                 if (inverted) {
20561                     if (! do_sep) { /* If didn't output anything in the bitmap
20562                                      */
20563                         sv_catpvs(sv, "^");
20564                     }
20565                     sv_catpvs(sv, "{");
20566                 }
20567                 else if (do_sep) {
20568                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20569                                                       PL_colors[0]);
20570                 }
20571                 sv_catsv(sv, unresolved);
20572                 if (inverted) {
20573                     sv_catpvs(sv, "}");
20574                 }
20575                 do_sep = ! inverted;
20576             }
20577         }
20578
20579         /* And, finally, add the above-the-bitmap stuff */
20580         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20581             SV* contents;
20582
20583             /* See if truncation size is overridden */
20584             const STRLEN dump_len = (PL_dump_re_max_len > 256)
20585                                     ? PL_dump_re_max_len
20586                                     : 256;
20587
20588             /* This is output in a separate [] */
20589             if (do_sep) {
20590                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20591             }
20592
20593             /* And, for easy of understanding, it is shown in the
20594              * uncomplemented form if possible.  The one exception being if
20595              * there are unresolved items, where the inversion has to be
20596              * delayed until runtime */
20597             if (inverted && ! unresolved) {
20598                 _invlist_invert(nonbitmap_invlist);
20599                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20600             }
20601
20602             contents = invlist_contents(nonbitmap_invlist,
20603                                         FALSE /* output suitable for catsv */
20604                                        );
20605
20606             /* If the output is shorter than the permissible maximum, just do it. */
20607             if (SvCUR(contents) <= dump_len) {
20608                 sv_catsv(sv, contents);
20609             }
20610             else {
20611                 const char * contents_string = SvPVX(contents);
20612                 STRLEN i = dump_len;
20613
20614                 /* Otherwise, start at the permissible max and work back to the
20615                  * first break possibility */
20616                 while (i > 0 && contents_string[i] != ' ') {
20617                     i--;
20618                 }
20619                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
20620                                        find a legal break */
20621                     i = dump_len;
20622                 }
20623
20624                 sv_catpvn(sv, contents_string, i);
20625                 sv_catpvs(sv, "...");
20626             }
20627
20628             SvREFCNT_dec_NN(contents);
20629             SvREFCNT_dec_NN(nonbitmap_invlist);
20630         }
20631
20632         /* And finally the matching, closing ']' */
20633         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20634
20635         if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
20636             U8 lowest = (OP(o) != ANYOFHr)
20637                          ? FLAGS(o)
20638                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
20639             U8 highest = (OP(o) == ANYOFHb)
20640                          ? lowest
20641                          : OP(o) == ANYOFH
20642                            ? 0xFF
20643                            : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
20644             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
20645             if (lowest != highest) {
20646                 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
20647             }
20648             Perl_sv_catpvf(aTHX_ sv, ")");
20649         }
20650
20651         SvREFCNT_dec(unresolved);
20652     }
20653     else if (k == ANYOFM) {
20654         SV * cp_list = get_ANYOFM_contents(o);
20655
20656         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20657         if (OP(o) == NANYOFM) {
20658             _invlist_invert(cp_list);
20659         }
20660
20661         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20662         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20663
20664         SvREFCNT_dec(cp_list);
20665     }
20666     else if (k == POSIXD || k == NPOSIXD) {
20667         U8 index = FLAGS(o) * 2;
20668         if (index < C_ARRAY_LENGTH(anyofs)) {
20669             if (*anyofs[index] != '[')  {
20670                 sv_catpvs(sv, "[");
20671             }
20672             sv_catpv(sv, anyofs[index]);
20673             if (*anyofs[index] != '[')  {
20674                 sv_catpvs(sv, "]");
20675             }
20676         }
20677         else {
20678             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20679         }
20680     }
20681     else if (k == BOUND || k == NBOUND) {
20682         /* Must be synced with order of 'bound_type' in regcomp.h */
20683         const char * const bounds[] = {
20684             "",      /* Traditional */
20685             "{gcb}",
20686             "{lb}",
20687             "{sb}",
20688             "{wb}"
20689         };
20690         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20691         sv_catpv(sv, bounds[FLAGS(o)]);
20692     }
20693     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
20694         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
20695         if (o->next_off) {
20696             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
20697         }
20698         Perl_sv_catpvf(aTHX_ sv, "]");
20699     }
20700     else if (OP(o) == SBOL)
20701         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20702
20703     /* add on the verb argument if there is one */
20704     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20705         if ( ARG(o) )
20706             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20707                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20708         else
20709             sv_catpvs(sv, ":NULL");
20710     }
20711 #else
20712     PERL_UNUSED_CONTEXT;
20713     PERL_UNUSED_ARG(sv);
20714     PERL_UNUSED_ARG(o);
20715     PERL_UNUSED_ARG(prog);
20716     PERL_UNUSED_ARG(reginfo);
20717     PERL_UNUSED_ARG(pRExC_state);
20718 #endif  /* DEBUGGING */
20719 }
20720
20721
20722
20723 SV *
20724 Perl_re_intuit_string(pTHX_ REGEXP * const r)
20725 {                               /* Assume that RE_INTUIT is set */
20726     struct regexp *const prog = ReANY(r);
20727     GET_RE_DEBUG_FLAGS_DECL;
20728
20729     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20730     PERL_UNUSED_CONTEXT;
20731
20732     DEBUG_COMPILE_r(
20733         {
20734             const char * const s = SvPV_nolen_const(RX_UTF8(r)
20735                       ? prog->check_utf8 : prog->check_substr);
20736
20737             if (!PL_colorset) reginitcolors();
20738             Perl_re_printf( aTHX_
20739                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20740                       PL_colors[4],
20741                       RX_UTF8(r) ? "utf8 " : "",
20742                       PL_colors[5], PL_colors[0],
20743                       s,
20744                       PL_colors[1],
20745                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20746         } );
20747
20748     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20749     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20750 }
20751
20752 /*
20753    pregfree()
20754
20755    handles refcounting and freeing the perl core regexp structure. When
20756    it is necessary to actually free the structure the first thing it
20757    does is call the 'free' method of the regexp_engine associated to
20758    the regexp, allowing the handling of the void *pprivate; member
20759    first. (This routine is not overridable by extensions, which is why
20760    the extensions free is called first.)
20761
20762    See regdupe and regdupe_internal if you change anything here.
20763 */
20764 #ifndef PERL_IN_XSUB_RE
20765 void
20766 Perl_pregfree(pTHX_ REGEXP *r)
20767 {
20768     SvREFCNT_dec(r);
20769 }
20770
20771 void
20772 Perl_pregfree2(pTHX_ REGEXP *rx)
20773 {
20774     struct regexp *const r = ReANY(rx);
20775     GET_RE_DEBUG_FLAGS_DECL;
20776
20777     PERL_ARGS_ASSERT_PREGFREE2;
20778
20779     if (! r)
20780         return;
20781
20782     if (r->mother_re) {
20783         ReREFCNT_dec(r->mother_re);
20784     } else {
20785         CALLREGFREE_PVT(rx); /* free the private data */
20786         SvREFCNT_dec(RXp_PAREN_NAMES(r));
20787     }
20788     if (r->substrs) {
20789         int i;
20790         for (i = 0; i < 2; i++) {
20791             SvREFCNT_dec(r->substrs->data[i].substr);
20792             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20793         }
20794         Safefree(r->substrs);
20795     }
20796     RX_MATCH_COPY_FREE(rx);
20797 #ifdef PERL_ANY_COW
20798     SvREFCNT_dec(r->saved_copy);
20799 #endif
20800     Safefree(r->offs);
20801     SvREFCNT_dec(r->qr_anoncv);
20802     if (r->recurse_locinput)
20803         Safefree(r->recurse_locinput);
20804 }
20805
20806
20807 /*  reg_temp_copy()
20808
20809     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20810     except that dsv will be created if NULL.
20811
20812     This function is used in two main ways. First to implement
20813         $r = qr/....; $s = $$r;
20814
20815     Secondly, it is used as a hacky workaround to the structural issue of
20816     match results
20817     being stored in the regexp structure which is in turn stored in
20818     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20819     could be PL_curpm in multiple contexts, and could require multiple
20820     result sets being associated with the pattern simultaneously, such
20821     as when doing a recursive match with (??{$qr})
20822
20823     The solution is to make a lightweight copy of the regexp structure
20824     when a qr// is returned from the code executed by (??{$qr}) this
20825     lightweight copy doesn't actually own any of its data except for
20826     the starp/end and the actual regexp structure itself.
20827
20828 */
20829
20830
20831 REGEXP *
20832 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20833 {
20834     struct regexp *drx;
20835     struct regexp *const srx = ReANY(ssv);
20836     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20837
20838     PERL_ARGS_ASSERT_REG_TEMP_COPY;
20839
20840     if (!dsv)
20841         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20842     else {
20843         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
20844
20845         /* our only valid caller, sv_setsv_flags(), should have done
20846          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
20847         assert(!SvOOK(dsv));
20848         assert(!SvIsCOW(dsv));
20849         assert(!SvROK(dsv));
20850
20851         if (SvPVX_const(dsv)) {
20852             if (SvLEN(dsv))
20853                 Safefree(SvPVX(dsv));
20854             SvPVX(dsv) = NULL;
20855         }
20856         SvLEN_set(dsv, 0);
20857         SvCUR_set(dsv, 0);
20858         SvOK_off((SV *)dsv);
20859
20860         if (islv) {
20861             /* For PVLVs, the head (sv_any) points to an XPVLV, while
20862              * the LV's xpvlenu_rx will point to a regexp body, which
20863              * we allocate here */
20864             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20865             assert(!SvPVX(dsv));
20866             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20867             temp->sv_any = NULL;
20868             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20869             SvREFCNT_dec_NN(temp);
20870             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20871                ing below will not set it. */
20872             SvCUR_set(dsv, SvCUR(ssv));
20873         }
20874     }
20875     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20876        sv_force_normal(sv) is called.  */
20877     SvFAKE_on(dsv);
20878     drx = ReANY(dsv);
20879
20880     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20881     SvPV_set(dsv, RX_WRAPPED(ssv));
20882     /* We share the same string buffer as the original regexp, on which we
20883        hold a reference count, incremented when mother_re is set below.
20884        The string pointer is copied here, being part of the regexp struct.
20885      */
20886     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20887            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20888     if (!islv)
20889         SvLEN_set(dsv, 0);
20890     if (srx->offs) {
20891         const I32 npar = srx->nparens+1;
20892         Newx(drx->offs, npar, regexp_paren_pair);
20893         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20894     }
20895     if (srx->substrs) {
20896         int i;
20897         Newx(drx->substrs, 1, struct reg_substr_data);
20898         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20899
20900         for (i = 0; i < 2; i++) {
20901             SvREFCNT_inc_void(drx->substrs->data[i].substr);
20902             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20903         }
20904
20905         /* check_substr and check_utf8, if non-NULL, point to either their
20906            anchored or float namesakes, and don't hold a second reference.  */
20907     }
20908     RX_MATCH_COPIED_off(dsv);
20909 #ifdef PERL_ANY_COW
20910     drx->saved_copy = NULL;
20911 #endif
20912     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20913     SvREFCNT_inc_void(drx->qr_anoncv);
20914     if (srx->recurse_locinput)
20915         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20916
20917     return dsv;
20918 }
20919 #endif
20920
20921
20922 /* regfree_internal()
20923
20924    Free the private data in a regexp. This is overloadable by
20925    extensions. Perl takes care of the regexp structure in pregfree(),
20926    this covers the *pprivate pointer which technically perl doesn't
20927    know about, however of course we have to handle the
20928    regexp_internal structure when no extension is in use.
20929
20930    Note this is called before freeing anything in the regexp
20931    structure.
20932  */
20933
20934 void
20935 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20936 {
20937     struct regexp *const r = ReANY(rx);
20938     RXi_GET_DECL(r, ri);
20939     GET_RE_DEBUG_FLAGS_DECL;
20940
20941     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20942
20943     if (! ri) {
20944         return;
20945     }
20946
20947     DEBUG_COMPILE_r({
20948         if (!PL_colorset)
20949             reginitcolors();
20950         {
20951             SV *dsv= sv_newmortal();
20952             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20953                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20954             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20955                 PL_colors[4], PL_colors[5], s);
20956         }
20957     });
20958
20959 #ifdef RE_TRACK_PATTERN_OFFSETS
20960     if (ri->u.offsets)
20961         Safefree(ri->u.offsets);             /* 20010421 MJD */
20962 #endif
20963     if (ri->code_blocks)
20964         S_free_codeblocks(aTHX_ ri->code_blocks);
20965
20966     if (ri->data) {
20967         int n = ri->data->count;
20968
20969         while (--n >= 0) {
20970           /* If you add a ->what type here, update the comment in regcomp.h */
20971             switch (ri->data->what[n]) {
20972             case 'a':
20973             case 'r':
20974             case 's':
20975             case 'S':
20976             case 'u':
20977                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20978                 break;
20979             case 'f':
20980                 Safefree(ri->data->data[n]);
20981                 break;
20982             case 'l':
20983             case 'L':
20984                 break;
20985             case 'T':
20986                 { /* Aho Corasick add-on structure for a trie node.
20987                      Used in stclass optimization only */
20988                     U32 refcount;
20989                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20990 #ifdef USE_ITHREADS
20991                     dVAR;
20992 #endif
20993                     OP_REFCNT_LOCK;
20994                     refcount = --aho->refcount;
20995                     OP_REFCNT_UNLOCK;
20996                     if ( !refcount ) {
20997                         PerlMemShared_free(aho->states);
20998                         PerlMemShared_free(aho->fail);
20999                          /* do this last!!!! */
21000                         PerlMemShared_free(ri->data->data[n]);
21001                         /* we should only ever get called once, so
21002                          * assert as much, and also guard the free
21003                          * which /might/ happen twice. At the least
21004                          * it will make code anlyzers happy and it
21005                          * doesn't cost much. - Yves */
21006                         assert(ri->regstclass);
21007                         if (ri->regstclass) {
21008                             PerlMemShared_free(ri->regstclass);
21009                             ri->regstclass = 0;
21010                         }
21011                     }
21012                 }
21013                 break;
21014             case 't':
21015                 {
21016                     /* trie structure. */
21017                     U32 refcount;
21018                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21019 #ifdef USE_ITHREADS
21020                     dVAR;
21021 #endif
21022                     OP_REFCNT_LOCK;
21023                     refcount = --trie->refcount;
21024                     OP_REFCNT_UNLOCK;
21025                     if ( !refcount ) {
21026                         PerlMemShared_free(trie->charmap);
21027                         PerlMemShared_free(trie->states);
21028                         PerlMemShared_free(trie->trans);
21029                         if (trie->bitmap)
21030                             PerlMemShared_free(trie->bitmap);
21031                         if (trie->jump)
21032                             PerlMemShared_free(trie->jump);
21033                         PerlMemShared_free(trie->wordinfo);
21034                         /* do this last!!!! */
21035                         PerlMemShared_free(ri->data->data[n]);
21036                     }
21037                 }
21038                 break;
21039             default:
21040                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21041                                                     ri->data->what[n]);
21042             }
21043         }
21044         Safefree(ri->data->what);
21045         Safefree(ri->data);
21046     }
21047
21048     Safefree(ri);
21049 }
21050
21051 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21052 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21053 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
21054
21055 /*
21056    re_dup_guts - duplicate a regexp.
21057
21058    This routine is expected to clone a given regexp structure. It is only
21059    compiled under USE_ITHREADS.
21060
21061    After all of the core data stored in struct regexp is duplicated
21062    the regexp_engine.dupe method is used to copy any private data
21063    stored in the *pprivate pointer. This allows extensions to handle
21064    any duplication it needs to do.
21065
21066    See pregfree() and regfree_internal() if you change anything here.
21067 */
21068 #if defined(USE_ITHREADS)
21069 #ifndef PERL_IN_XSUB_RE
21070 void
21071 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21072 {
21073     dVAR;
21074     I32 npar;
21075     const struct regexp *r = ReANY(sstr);
21076     struct regexp *ret = ReANY(dstr);
21077
21078     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21079
21080     npar = r->nparens+1;
21081     Newx(ret->offs, npar, regexp_paren_pair);
21082     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21083
21084     if (ret->substrs) {
21085         /* Do it this way to avoid reading from *r after the StructCopy().
21086            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21087            cache, it doesn't matter.  */
21088         int i;
21089         const bool anchored = r->check_substr
21090             ? r->check_substr == r->substrs->data[0].substr
21091             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21092         Newx(ret->substrs, 1, struct reg_substr_data);
21093         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21094
21095         for (i = 0; i < 2; i++) {
21096             ret->substrs->data[i].substr =
21097                         sv_dup_inc(ret->substrs->data[i].substr, param);
21098             ret->substrs->data[i].utf8_substr =
21099                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21100         }
21101
21102         /* check_substr and check_utf8, if non-NULL, point to either their
21103            anchored or float namesakes, and don't hold a second reference.  */
21104
21105         if (ret->check_substr) {
21106             if (anchored) {
21107                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21108
21109                 ret->check_substr = ret->substrs->data[0].substr;
21110                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21111             } else {
21112                 assert(r->check_substr == r->substrs->data[1].substr);
21113                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21114
21115                 ret->check_substr = ret->substrs->data[1].substr;
21116                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21117             }
21118         } else if (ret->check_utf8) {
21119             if (anchored) {
21120                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21121             } else {
21122                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21123             }
21124         }
21125     }
21126
21127     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21128     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21129     if (r->recurse_locinput)
21130         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21131
21132     if (ret->pprivate)
21133         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21134
21135     if (RX_MATCH_COPIED(dstr))
21136         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21137     else
21138         ret->subbeg = NULL;
21139 #ifdef PERL_ANY_COW
21140     ret->saved_copy = NULL;
21141 #endif
21142
21143     /* Whether mother_re be set or no, we need to copy the string.  We
21144        cannot refrain from copying it when the storage points directly to
21145        our mother regexp, because that's
21146                1: a buffer in a different thread
21147                2: something we no longer hold a reference on
21148                so we need to copy it locally.  */
21149     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21150     /* set malloced length to a non-zero value so it will be freed
21151      * (otherwise in combination with SVf_FAKE it looks like an alien
21152      * buffer). It doesn't have to be the actual malloced size, since it
21153      * should never be grown */
21154     SvLEN_set(dstr, SvCUR(sstr)+1);
21155     ret->mother_re   = NULL;
21156 }
21157 #endif /* PERL_IN_XSUB_RE */
21158
21159 /*
21160    regdupe_internal()
21161
21162    This is the internal complement to regdupe() which is used to copy
21163    the structure pointed to by the *pprivate pointer in the regexp.
21164    This is the core version of the extension overridable cloning hook.
21165    The regexp structure being duplicated will be copied by perl prior
21166    to this and will be provided as the regexp *r argument, however
21167    with the /old/ structures pprivate pointer value. Thus this routine
21168    may override any copying normally done by perl.
21169
21170    It returns a pointer to the new regexp_internal structure.
21171 */
21172
21173 void *
21174 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21175 {
21176     dVAR;
21177     struct regexp *const r = ReANY(rx);
21178     regexp_internal *reti;
21179     int len;
21180     RXi_GET_DECL(r, ri);
21181
21182     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21183
21184     len = ProgLen(ri);
21185
21186     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21187           char, regexp_internal);
21188     Copy(ri->program, reti->program, len+1, regnode);
21189
21190
21191     if (ri->code_blocks) {
21192         int n;
21193         Newx(reti->code_blocks, 1, struct reg_code_blocks);
21194         Newx(reti->code_blocks->cb, ri->code_blocks->count,
21195                     struct reg_code_block);
21196         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21197              ri->code_blocks->count, struct reg_code_block);
21198         for (n = 0; n < ri->code_blocks->count; n++)
21199              reti->code_blocks->cb[n].src_regex = (REGEXP*)
21200                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21201         reti->code_blocks->count = ri->code_blocks->count;
21202         reti->code_blocks->refcnt = 1;
21203     }
21204     else
21205         reti->code_blocks = NULL;
21206
21207     reti->regstclass = NULL;
21208
21209     if (ri->data) {
21210         struct reg_data *d;
21211         const int count = ri->data->count;
21212         int i;
21213
21214         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21215                 char, struct reg_data);
21216         Newx(d->what, count, U8);
21217
21218         d->count = count;
21219         for (i = 0; i < count; i++) {
21220             d->what[i] = ri->data->what[i];
21221             switch (d->what[i]) {
21222                 /* see also regcomp.h and regfree_internal() */
21223             case 'a': /* actually an AV, but the dup function is identical.
21224                          values seem to be "plain sv's" generally. */
21225             case 'r': /* a compiled regex (but still just another SV) */
21226             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21227                          this use case should go away, the code could have used
21228                          'a' instead - see S_set_ANYOF_arg() for array contents. */
21229             case 'S': /* actually an SV, but the dup function is identical.  */
21230             case 'u': /* actually an HV, but the dup function is identical.
21231                          values are "plain sv's" */
21232                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21233                 break;
21234             case 'f':
21235                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21236                  * patterns which could start with several different things. Pre-TRIE
21237                  * this was more important than it is now, however this still helps
21238                  * in some places, for instance /x?a+/ might produce a SSC equivalent
21239                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21240                  * in regexec.c
21241                  */
21242                 /* This is cheating. */
21243                 Newx(d->data[i], 1, regnode_ssc);
21244                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21245                 reti->regstclass = (regnode*)d->data[i];
21246                 break;
21247             case 'T':
21248                 /* AHO-CORASICK fail table */
21249                 /* Trie stclasses are readonly and can thus be shared
21250                  * without duplication. We free the stclass in pregfree
21251                  * when the corresponding reg_ac_data struct is freed.
21252                  */
21253                 reti->regstclass= ri->regstclass;
21254                 /* FALLTHROUGH */
21255             case 't':
21256                 /* TRIE transition table */
21257                 OP_REFCNT_LOCK;
21258                 ((reg_trie_data*)ri->data->data[i])->refcount++;
21259                 OP_REFCNT_UNLOCK;
21260                 /* FALLTHROUGH */
21261             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21262             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21263                          is not from another regexp */
21264                 d->data[i] = ri->data->data[i];
21265                 break;
21266             default:
21267                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21268                                                            ri->data->what[i]);
21269             }
21270         }
21271
21272         reti->data = d;
21273     }
21274     else
21275         reti->data = NULL;
21276
21277     reti->name_list_idx = ri->name_list_idx;
21278
21279 #ifdef RE_TRACK_PATTERN_OFFSETS
21280     if (ri->u.offsets) {
21281         Newx(reti->u.offsets, 2*len+1, U32);
21282         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21283     }
21284 #else
21285     SetProgLen(reti, len);
21286 #endif
21287
21288     return (void*)reti;
21289 }
21290
21291 #endif    /* USE_ITHREADS */
21292
21293 #ifndef PERL_IN_XSUB_RE
21294
21295 /*
21296  - regnext - dig the "next" pointer out of a node
21297  */
21298 regnode *
21299 Perl_regnext(pTHX_ regnode *p)
21300 {
21301     I32 offset;
21302
21303     if (!p)
21304         return(NULL);
21305
21306     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
21307         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21308                                                 (int)OP(p), (int)REGNODE_MAX);
21309     }
21310
21311     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21312     if (offset == 0)
21313         return(NULL);
21314
21315     return(p+offset);
21316 }
21317
21318 #endif
21319
21320 STATIC void
21321 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21322 {
21323     va_list args;
21324     STRLEN l1 = strlen(pat1);
21325     STRLEN l2 = strlen(pat2);
21326     char buf[512];
21327     SV *msv;
21328     const char *message;
21329
21330     PERL_ARGS_ASSERT_RE_CROAK2;
21331
21332     if (l1 > 510)
21333         l1 = 510;
21334     if (l1 + l2 > 510)
21335         l2 = 510 - l1;
21336     Copy(pat1, buf, l1 , char);
21337     Copy(pat2, buf + l1, l2 , char);
21338     buf[l1 + l2] = '\n';
21339     buf[l1 + l2 + 1] = '\0';
21340     va_start(args, pat2);
21341     msv = vmess(buf, &args);
21342     va_end(args);
21343     message = SvPV_const(msv, l1);
21344     if (l1 > 512)
21345         l1 = 512;
21346     Copy(message, buf, l1 , char);
21347     /* l1-1 to avoid \n */
21348     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21349 }
21350
21351 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21352
21353 #ifndef PERL_IN_XSUB_RE
21354 void
21355 Perl_save_re_context(pTHX)
21356 {
21357     I32 nparens = -1;
21358     I32 i;
21359
21360     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21361
21362     if (PL_curpm) {
21363         const REGEXP * const rx = PM_GETRE(PL_curpm);
21364         if (rx)
21365             nparens = RX_NPARENS(rx);
21366     }
21367
21368     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21369      * that PL_curpm will be null, but that utf8.pm and the modules it
21370      * loads will only use $1..$3.
21371      * The t/porting/re_context.t test file checks this assumption.
21372      */
21373     if (nparens == -1)
21374         nparens = 3;
21375
21376     for (i = 1; i <= nparens; i++) {
21377         char digits[TYPE_CHARS(long)];
21378         const STRLEN len = my_snprintf(digits, sizeof(digits),
21379                                        "%lu", (long)i);
21380         GV *const *const gvp
21381             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21382
21383         if (gvp) {
21384             GV * const gv = *gvp;
21385             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21386                 save_scalar(gv);
21387         }
21388     }
21389 }
21390 #endif
21391
21392 #ifdef DEBUGGING
21393
21394 STATIC void
21395 S_put_code_point(pTHX_ SV *sv, UV c)
21396 {
21397     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21398
21399     if (c > 255) {
21400         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21401     }
21402     else if (isPRINT(c)) {
21403         const char string = (char) c;
21404
21405         /* We use {phrase} as metanotation in the class, so also escape literal
21406          * braces */
21407         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21408             sv_catpvs(sv, "\\");
21409         sv_catpvn(sv, &string, 1);
21410     }
21411     else if (isMNEMONIC_CNTRL(c)) {
21412         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21413     }
21414     else {
21415         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21416     }
21417 }
21418
21419 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21420
21421 STATIC void
21422 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21423 {
21424     /* Appends to 'sv' a displayable version of the range of code points from
21425      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21426      * that have them, when they occur at the beginning or end of the range.
21427      * It uses hex to output the remaining code points, unless 'allow_literals'
21428      * is true, in which case the printable ASCII ones are output as-is (though
21429      * some of these will be escaped by put_code_point()).
21430      *
21431      * NOTE:  This is designed only for printing ranges of code points that fit
21432      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21433      */
21434
21435     const unsigned int min_range_count = 3;
21436
21437     assert(start <= end);
21438
21439     PERL_ARGS_ASSERT_PUT_RANGE;
21440
21441     while (start <= end) {
21442         UV this_end;
21443         const char * format;
21444
21445         if (end - start < min_range_count) {
21446
21447             /* Output chars individually when they occur in short ranges */
21448             for (; start <= end; start++) {
21449                 put_code_point(sv, start);
21450             }
21451             break;
21452         }
21453
21454         /* If permitted by the input options, and there is a possibility that
21455          * this range contains a printable literal, look to see if there is
21456          * one. */
21457         if (allow_literals && start <= MAX_PRINT_A) {
21458
21459             /* If the character at the beginning of the range isn't an ASCII
21460              * printable, effectively split the range into two parts:
21461              *  1) the portion before the first such printable,
21462              *  2) the rest
21463              * and output them separately. */
21464             if (! isPRINT_A(start)) {
21465                 UV temp_end = start + 1;
21466
21467                 /* There is no point looking beyond the final possible
21468                  * printable, in MAX_PRINT_A */
21469                 UV max = MIN(end, MAX_PRINT_A);
21470
21471                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21472                     temp_end++;
21473                 }
21474
21475                 /* Here, temp_end points to one beyond the first printable if
21476                  * found, or to one beyond 'max' if not.  If none found, make
21477                  * sure that we use the entire range */
21478                 if (temp_end > MAX_PRINT_A) {
21479                     temp_end = end + 1;
21480                 }
21481
21482                 /* Output the first part of the split range: the part that
21483                  * doesn't have printables, with the parameter set to not look
21484                  * for literals (otherwise we would infinitely recurse) */
21485                 put_range(sv, start, temp_end - 1, FALSE);
21486
21487                 /* The 2nd part of the range (if any) starts here. */
21488                 start = temp_end;
21489
21490                 /* We do a continue, instead of dropping down, because even if
21491                  * the 2nd part is non-empty, it could be so short that we want
21492                  * to output it as individual characters, as tested for at the
21493                  * top of this loop.  */
21494                 continue;
21495             }
21496
21497             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21498              * output a sub-range of just the digits or letters, then process
21499              * the remaining portion as usual. */
21500             if (isALPHANUMERIC_A(start)) {
21501                 UV mask = (isDIGIT_A(start))
21502                            ? _CC_DIGIT
21503                              : isUPPER_A(start)
21504                                ? _CC_UPPER
21505                                : _CC_LOWER;
21506                 UV temp_end = start + 1;
21507
21508                 /* Find the end of the sub-range that includes just the
21509                  * characters in the same class as the first character in it */
21510                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21511                     temp_end++;
21512                 }
21513                 temp_end--;
21514
21515                 /* For short ranges, don't duplicate the code above to output
21516                  * them; just call recursively */
21517                 if (temp_end - start < min_range_count) {
21518                     put_range(sv, start, temp_end, FALSE);
21519                 }
21520                 else {  /* Output as a range */
21521                     put_code_point(sv, start);
21522                     sv_catpvs(sv, "-");
21523                     put_code_point(sv, temp_end);
21524                 }
21525                 start = temp_end + 1;
21526                 continue;
21527             }
21528
21529             /* We output any other printables as individual characters */
21530             if (isPUNCT_A(start) || isSPACE_A(start)) {
21531                 while (start <= end && (isPUNCT_A(start)
21532                                         || isSPACE_A(start)))
21533                 {
21534                     put_code_point(sv, start);
21535                     start++;
21536                 }
21537                 continue;
21538             }
21539         } /* End of looking for literals */
21540
21541         /* Here is not to output as a literal.  Some control characters have
21542          * mnemonic names.  Split off any of those at the beginning and end of
21543          * the range to print mnemonically.  It isn't possible for many of
21544          * these to be in a row, so this won't overwhelm with output */
21545         if (   start <= end
21546             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21547         {
21548             while (isMNEMONIC_CNTRL(start) && start <= end) {
21549                 put_code_point(sv, start);
21550                 start++;
21551             }
21552
21553             /* If this didn't take care of the whole range ... */
21554             if (start <= end) {
21555
21556                 /* Look backwards from the end to find the final non-mnemonic
21557                  * */
21558                 UV temp_end = end;
21559                 while (isMNEMONIC_CNTRL(temp_end)) {
21560                     temp_end--;
21561                 }
21562
21563                 /* And separately output the interior range that doesn't start
21564                  * or end with mnemonics */
21565                 put_range(sv, start, temp_end, FALSE);
21566
21567                 /* Then output the mnemonic trailing controls */
21568                 start = temp_end + 1;
21569                 while (start <= end) {
21570                     put_code_point(sv, start);
21571                     start++;
21572                 }
21573                 break;
21574             }
21575         }
21576
21577         /* As a final resort, output the range or subrange as hex. */
21578
21579         this_end = (end < NUM_ANYOF_CODE_POINTS)
21580                     ? end
21581                     : NUM_ANYOF_CODE_POINTS - 1;
21582 #if NUM_ANYOF_CODE_POINTS > 256
21583         format = (this_end < 256)
21584                  ? "\\x%02" UVXf "-\\x%02" UVXf
21585                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21586 #else
21587         format = "\\x%02" UVXf "-\\x%02" UVXf;
21588 #endif
21589         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21590         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21591         GCC_DIAG_RESTORE_STMT;
21592         break;
21593     }
21594 }
21595
21596 STATIC void
21597 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21598 {
21599     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21600      * 'invlist' */
21601
21602     UV start, end;
21603     bool allow_literals = TRUE;
21604
21605     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21606
21607     /* Generally, it is more readable if printable characters are output as
21608      * literals, but if a range (nearly) spans all of them, it's best to output
21609      * it as a single range.  This code will use a single range if all but 2
21610      * ASCII printables are in it */
21611     invlist_iterinit(invlist);
21612     while (invlist_iternext(invlist, &start, &end)) {
21613
21614         /* If the range starts beyond the final printable, it doesn't have any
21615          * in it */
21616         if (start > MAX_PRINT_A) {
21617             break;
21618         }
21619
21620         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
21621          * all but two, the range must start and end no later than 2 from
21622          * either end */
21623         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21624             if (end > MAX_PRINT_A) {
21625                 end = MAX_PRINT_A;
21626             }
21627             if (start < ' ') {
21628                 start = ' ';
21629             }
21630             if (end - start >= MAX_PRINT_A - ' ' - 2) {
21631                 allow_literals = FALSE;
21632             }
21633             break;
21634         }
21635     }
21636     invlist_iterfinish(invlist);
21637
21638     /* Here we have figured things out.  Output each range */
21639     invlist_iterinit(invlist);
21640     while (invlist_iternext(invlist, &start, &end)) {
21641         if (start >= NUM_ANYOF_CODE_POINTS) {
21642             break;
21643         }
21644         put_range(sv, start, end, allow_literals);
21645     }
21646     invlist_iterfinish(invlist);
21647
21648     return;
21649 }
21650
21651 STATIC SV*
21652 S_put_charclass_bitmap_innards_common(pTHX_
21653         SV* invlist,            /* The bitmap */
21654         SV* posixes,            /* Under /l, things like [:word:], \S */
21655         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
21656         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
21657         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
21658         const bool invert       /* Is the result to be inverted? */
21659 )
21660 {
21661     /* Create and return an SV containing a displayable version of the bitmap
21662      * and associated information determined by the input parameters.  If the
21663      * output would have been only the inversion indicator '^', NULL is instead
21664      * returned. */
21665
21666     dVAR;
21667     SV * output;
21668
21669     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21670
21671     if (invert) {
21672         output = newSVpvs("^");
21673     }
21674     else {
21675         output = newSVpvs("");
21676     }
21677
21678     /* First, the code points in the bitmap that are unconditionally there */
21679     put_charclass_bitmap_innards_invlist(output, invlist);
21680
21681     /* Traditionally, these have been placed after the main code points */
21682     if (posixes) {
21683         sv_catsv(output, posixes);
21684     }
21685
21686     if (only_utf8 && _invlist_len(only_utf8)) {
21687         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21688         put_charclass_bitmap_innards_invlist(output, only_utf8);
21689     }
21690
21691     if (not_utf8 && _invlist_len(not_utf8)) {
21692         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21693         put_charclass_bitmap_innards_invlist(output, not_utf8);
21694     }
21695
21696     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21697         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21698         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21699
21700         /* This is the only list in this routine that can legally contain code
21701          * points outside the bitmap range.  The call just above to
21702          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21703          * output them here.  There's about a half-dozen possible, and none in
21704          * contiguous ranges longer than 2 */
21705         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21706             UV start, end;
21707             SV* above_bitmap = NULL;
21708
21709             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21710
21711             invlist_iterinit(above_bitmap);
21712             while (invlist_iternext(above_bitmap, &start, &end)) {
21713                 UV i;
21714
21715                 for (i = start; i <= end; i++) {
21716                     put_code_point(output, i);
21717                 }
21718             }
21719             invlist_iterfinish(above_bitmap);
21720             SvREFCNT_dec_NN(above_bitmap);
21721         }
21722     }
21723
21724     if (invert && SvCUR(output) == 1) {
21725         return NULL;
21726     }
21727
21728     return output;
21729 }
21730
21731 STATIC bool
21732 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21733                                      char *bitmap,
21734                                      SV *nonbitmap_invlist,
21735                                      SV *only_utf8_locale_invlist,
21736                                      const regnode * const node,
21737                                      const bool force_as_is_display)
21738 {
21739     /* Appends to 'sv' a displayable version of the innards of the bracketed
21740      * character class defined by the other arguments:
21741      *  'bitmap' points to the bitmap, or NULL if to ignore that.
21742      *  'nonbitmap_invlist' is an inversion list of the code points that are in
21743      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
21744      *      none.  The reasons for this could be that they require some
21745      *      condition such as the target string being or not being in UTF-8
21746      *      (under /d), or because they came from a user-defined property that
21747      *      was not resolved at the time of the regex compilation (under /u)
21748      *  'only_utf8_locale_invlist' is an inversion list of the code points that
21749      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
21750      *  'node' is the regex pattern ANYOF node.  It is needed only when the
21751      *      above two parameters are not null, and is passed so that this
21752      *      routine can tease apart the various reasons for them.
21753      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
21754      *      to invert things to see if that leads to a cleaner display.  If
21755      *      FALSE, this routine is free to use its judgment about doing this.
21756      *
21757      * It returns TRUE if there was actually something output.  (It may be that
21758      * the bitmap, etc is empty.)
21759      *
21760      * When called for outputting the bitmap of a non-ANYOF node, just pass the
21761      * bitmap, with the succeeding parameters set to NULL, and the final one to
21762      * FALSE.
21763      */
21764
21765     /* In general, it tries to display the 'cleanest' representation of the
21766      * innards, choosing whether to display them inverted or not, regardless of
21767      * whether the class itself is to be inverted.  However,  there are some
21768      * cases where it can't try inverting, as what actually matches isn't known
21769      * until runtime, and hence the inversion isn't either. */
21770
21771     dVAR;
21772     bool inverting_allowed = ! force_as_is_display;
21773
21774     int i;
21775     STRLEN orig_sv_cur = SvCUR(sv);
21776
21777     SV* invlist;            /* Inversion list we accumulate of code points that
21778                                are unconditionally matched */
21779     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
21780                                UTF-8 */
21781     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
21782                              */
21783     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
21784     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
21785                                        is UTF-8 */
21786
21787     SV* as_is_display;      /* The output string when we take the inputs
21788                                literally */
21789     SV* inverted_display;   /* The output string when we invert the inputs */
21790
21791     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21792
21793     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
21794                                                    to match? */
21795     /* We are biased in favor of displaying things without them being inverted,
21796      * as that is generally easier to understand */
21797     const int bias = 5;
21798
21799     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21800
21801     /* Start off with whatever code points are passed in.  (We clone, so we
21802      * don't change the caller's list) */
21803     if (nonbitmap_invlist) {
21804         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21805         invlist = invlist_clone(nonbitmap_invlist, NULL);
21806     }
21807     else {  /* Worst case size is every other code point is matched */
21808         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21809     }
21810
21811     if (flags) {
21812         if (OP(node) == ANYOFD) {
21813
21814             /* This flag indicates that the code points below 0x100 in the
21815              * nonbitmap list are precisely the ones that match only when the
21816              * target is UTF-8 (they should all be non-ASCII). */
21817             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21818             {
21819                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21820                 _invlist_subtract(invlist, only_utf8, &invlist);
21821             }
21822
21823             /* And this flag for matching all non-ASCII 0xFF and below */
21824             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21825             {
21826                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21827             }
21828         }
21829         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21830
21831             /* If either of these flags are set, what matches isn't
21832              * determinable except during execution, so don't know enough here
21833              * to invert */
21834             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21835                 inverting_allowed = FALSE;
21836             }
21837
21838             /* What the posix classes match also varies at runtime, so these
21839              * will be output symbolically. */
21840             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21841                 int i;
21842
21843                 posixes = newSVpvs("");
21844                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21845                     if (ANYOF_POSIXL_TEST(node, i)) {
21846                         sv_catpv(posixes, anyofs[i]);
21847                     }
21848                 }
21849             }
21850         }
21851     }
21852
21853     /* Accumulate the bit map into the unconditional match list */
21854     if (bitmap) {
21855         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21856             if (BITMAP_TEST(bitmap, i)) {
21857                 int start = i++;
21858                 for (;
21859                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21860                      i++)
21861                 { /* empty */ }
21862                 invlist = _add_range_to_invlist(invlist, start, i-1);
21863             }
21864         }
21865     }
21866
21867     /* Make sure that the conditional match lists don't have anything in them
21868      * that match unconditionally; otherwise the output is quite confusing.
21869      * This could happen if the code that populates these misses some
21870      * duplication. */
21871     if (only_utf8) {
21872         _invlist_subtract(only_utf8, invlist, &only_utf8);
21873     }
21874     if (not_utf8) {
21875         _invlist_subtract(not_utf8, invlist, &not_utf8);
21876     }
21877
21878     if (only_utf8_locale_invlist) {
21879
21880         /* Since this list is passed in, we have to make a copy before
21881          * modifying it */
21882         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21883
21884         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21885
21886         /* And, it can get really weird for us to try outputting an inverted
21887          * form of this list when it has things above the bitmap, so don't even
21888          * try */
21889         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21890             inverting_allowed = FALSE;
21891         }
21892     }
21893
21894     /* Calculate what the output would be if we take the input as-is */
21895     as_is_display = put_charclass_bitmap_innards_common(invlist,
21896                                                     posixes,
21897                                                     only_utf8,
21898                                                     not_utf8,
21899                                                     only_utf8_locale,
21900                                                     invert);
21901
21902     /* If have to take the output as-is, just do that */
21903     if (! inverting_allowed) {
21904         if (as_is_display) {
21905             sv_catsv(sv, as_is_display);
21906             SvREFCNT_dec_NN(as_is_display);
21907         }
21908     }
21909     else { /* But otherwise, create the output again on the inverted input, and
21910               use whichever version is shorter */
21911
21912         int inverted_bias, as_is_bias;
21913
21914         /* We will apply our bias to whichever of the the results doesn't have
21915          * the '^' */
21916         if (invert) {
21917             invert = FALSE;
21918             as_is_bias = bias;
21919             inverted_bias = 0;
21920         }
21921         else {
21922             invert = TRUE;
21923             as_is_bias = 0;
21924             inverted_bias = bias;
21925         }
21926
21927         /* Now invert each of the lists that contribute to the output,
21928          * excluding from the result things outside the possible range */
21929
21930         /* For the unconditional inversion list, we have to add in all the
21931          * conditional code points, so that when inverted, they will be gone
21932          * from it */
21933         _invlist_union(only_utf8, invlist, &invlist);
21934         _invlist_union(not_utf8, invlist, &invlist);
21935         _invlist_union(only_utf8_locale, invlist, &invlist);
21936         _invlist_invert(invlist);
21937         _invlist_intersection(invlist, PL_InBitmap, &invlist);
21938
21939         if (only_utf8) {
21940             _invlist_invert(only_utf8);
21941             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21942         }
21943         else if (not_utf8) {
21944
21945             /* If a code point matches iff the target string is not in UTF-8,
21946              * then complementing the result has it not match iff not in UTF-8,
21947              * which is the same thing as matching iff it is UTF-8. */
21948             only_utf8 = not_utf8;
21949             not_utf8 = NULL;
21950         }
21951
21952         if (only_utf8_locale) {
21953             _invlist_invert(only_utf8_locale);
21954             _invlist_intersection(only_utf8_locale,
21955                                   PL_InBitmap,
21956                                   &only_utf8_locale);
21957         }
21958
21959         inverted_display = put_charclass_bitmap_innards_common(
21960                                             invlist,
21961                                             posixes,
21962                                             only_utf8,
21963                                             not_utf8,
21964                                             only_utf8_locale, invert);
21965
21966         /* Use the shortest representation, taking into account our bias
21967          * against showing it inverted */
21968         if (   inverted_display
21969             && (   ! as_is_display
21970                 || (  SvCUR(inverted_display) + inverted_bias
21971                     < SvCUR(as_is_display)    + as_is_bias)))
21972         {
21973             sv_catsv(sv, inverted_display);
21974         }
21975         else if (as_is_display) {
21976             sv_catsv(sv, as_is_display);
21977         }
21978
21979         SvREFCNT_dec(as_is_display);
21980         SvREFCNT_dec(inverted_display);
21981     }
21982
21983     SvREFCNT_dec_NN(invlist);
21984     SvREFCNT_dec(only_utf8);
21985     SvREFCNT_dec(not_utf8);
21986     SvREFCNT_dec(posixes);
21987     SvREFCNT_dec(only_utf8_locale);
21988
21989     return SvCUR(sv) > orig_sv_cur;
21990 }
21991
21992 #define CLEAR_OPTSTART                                                       \
21993     if (optstart) STMT_START {                                               \
21994         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
21995                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21996         optstart=NULL;                                                       \
21997     } STMT_END
21998
21999 #define DUMPUNTIL(b,e)                                                       \
22000                     CLEAR_OPTSTART;                                          \
22001                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22002
22003 STATIC const regnode *
22004 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22005             const regnode *last, const regnode *plast,
22006             SV* sv, I32 indent, U32 depth)
22007 {
22008     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22009     const regnode *next;
22010     const regnode *optstart= NULL;
22011
22012     RXi_GET_DECL(r, ri);
22013     GET_RE_DEBUG_FLAGS_DECL;
22014
22015     PERL_ARGS_ASSERT_DUMPUNTIL;
22016
22017 #ifdef DEBUG_DUMPUNTIL
22018     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22019         last ? last-start : 0, plast ? plast-start : 0);
22020 #endif
22021
22022     if (plast && plast < last)
22023         last= plast;
22024
22025     while (PL_regkind[op] != END && (!last || node < last)) {
22026         assert(node);
22027         /* While that wasn't END last time... */
22028         NODE_ALIGN(node);
22029         op = OP(node);
22030         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22031             indent--;
22032         next = regnext((regnode *)node);
22033
22034         /* Where, what. */
22035         if (OP(node) == OPTIMIZED) {
22036             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22037                 optstart = node;
22038             else
22039                 goto after_print;
22040         } else
22041             CLEAR_OPTSTART;
22042
22043         regprop(r, sv, node, NULL, NULL);
22044         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22045                       (int)(2*indent + 1), "", SvPVX_const(sv));
22046
22047         if (OP(node) != OPTIMIZED) {
22048             if (next == NULL)           /* Next ptr. */
22049                 Perl_re_printf( aTHX_  " (0)");
22050             else if (PL_regkind[(U8)op] == BRANCH
22051                      && PL_regkind[OP(next)] != BRANCH )
22052                 Perl_re_printf( aTHX_  " (FAIL)");
22053             else
22054                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22055             Perl_re_printf( aTHX_ "\n");
22056         }
22057
22058       after_print:
22059         if (PL_regkind[(U8)op] == BRANCHJ) {
22060             assert(next);
22061             {
22062                 const regnode *nnode = (OP(next) == LONGJMP
22063                                        ? regnext((regnode *)next)
22064                                        : next);
22065                 if (last && nnode > last)
22066                     nnode = last;
22067                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22068             }
22069         }
22070         else if (PL_regkind[(U8)op] == BRANCH) {
22071             assert(next);
22072             DUMPUNTIL(NEXTOPER(node), next);
22073         }
22074         else if ( PL_regkind[(U8)op]  == TRIE ) {
22075             const regnode *this_trie = node;
22076             const char op = OP(node);
22077             const U32 n = ARG(node);
22078             const reg_ac_data * const ac = op>=AHOCORASICK ?
22079                (reg_ac_data *)ri->data->data[n] :
22080                NULL;
22081             const reg_trie_data * const trie =
22082                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22083 #ifdef DEBUGGING
22084             AV *const trie_words
22085                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22086 #endif
22087             const regnode *nextbranch= NULL;
22088             I32 word_idx;
22089             SvPVCLEAR(sv);
22090             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22091                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22092
22093                 Perl_re_indentf( aTHX_  "%s ",
22094                     indent+3,
22095                     elem_ptr
22096                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22097                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22098                                 PL_colors[0], PL_colors[1],
22099                                 (SvUTF8(*elem_ptr)
22100                                  ? PERL_PV_ESCAPE_UNI
22101                                  : 0)
22102                                 | PERL_PV_PRETTY_ELLIPSES
22103                                 | PERL_PV_PRETTY_LTGT
22104                             )
22105                     : "???"
22106                 );
22107                 if (trie->jump) {
22108                     U16 dist= trie->jump[word_idx+1];
22109                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22110                                (UV)((dist ? this_trie + dist : next) - start));
22111                     if (dist) {
22112                         if (!nextbranch)
22113                             nextbranch= this_trie + trie->jump[0];
22114                         DUMPUNTIL(this_trie + dist, nextbranch);
22115                     }
22116                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22117                         nextbranch= regnext((regnode *)nextbranch);
22118                 } else {
22119                     Perl_re_printf( aTHX_  "\n");
22120                 }
22121             }
22122             if (last && next > last)
22123                 node= last;
22124             else
22125                 node= next;
22126         }
22127         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22128             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22129                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22130         }
22131         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22132             assert(next);
22133             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22134         }
22135         else if ( op == PLUS || op == STAR) {
22136             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22137         }
22138         else if (PL_regkind[(U8)op] == EXACT) {
22139             /* Literal string, where present. */
22140             node += NODE_SZ_STR(node) - 1;
22141             node = NEXTOPER(node);
22142         }
22143         else {
22144             node = NEXTOPER(node);
22145             node += regarglen[(U8)op];
22146         }
22147         if (op == CURLYX || op == OPEN || op == SROPEN)
22148             indent++;
22149     }
22150     CLEAR_OPTSTART;
22151 #ifdef DEBUG_DUMPUNTIL
22152     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22153 #endif
22154     return node;
22155 }
22156
22157 #endif  /* DEBUGGING */
22158
22159 #ifndef PERL_IN_XSUB_RE
22160
22161 #include "uni_keywords.h"
22162
22163 void
22164 Perl_init_uniprops(pTHX)
22165 {
22166     dVAR;
22167
22168     PL_user_def_props = newHV();
22169
22170 #ifdef USE_ITHREADS
22171
22172     HvSHAREKEYS_off(PL_user_def_props);
22173     PL_user_def_props_aTHX = aTHX;
22174
22175 #endif
22176
22177     /* Set up the inversion list global variables */
22178
22179     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22180     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22181     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22182     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22183     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22184     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22185     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22186     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22187     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22188     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22189     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22190     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22191     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22192     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22193     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22194     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22195
22196     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22197     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22198     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22199     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22200     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22201     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22202     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22203     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22204     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22205     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22206     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22207     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22208     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22209     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22210     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22211     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22212
22213     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22214     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22215     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22216     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22217     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22218
22219     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22220     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22221     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22222
22223     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22224
22225     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22226     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22227
22228     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22229     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22230
22231     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22232     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22233                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22234     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22235                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22236     PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
22237                                             UNI__PERL_NON_FINAL_FOLDS]);
22238
22239     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22240     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22241     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22242     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22243     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22244     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22245     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22246     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22247     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22248
22249 #ifdef UNI_XIDC
22250     /* The below are used only by deprecated functions.  They could be removed */
22251     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22252     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22253     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22254 #endif
22255 }
22256
22257 #if 0
22258
22259 This code was mainly added for backcompat to give a warning for non-portable
22260 code points in user-defined properties.  But experiments showed that the
22261 warning in earlier perls were only omitted on overflow, which should be an
22262 error, so there really isnt a backcompat issue, and actually adding the
22263 warning when none was present before might cause breakage, for little gain.  So
22264 khw left this code in, but not enabled.  Tests were never added.
22265
22266 embed.fnc entry:
22267 Ei      |const char *|get_extended_utf8_msg|const UV cp
22268
22269 PERL_STATIC_INLINE const char *
22270 S_get_extended_utf8_msg(pTHX_ const UV cp)
22271 {
22272     U8 dummy[UTF8_MAXBYTES + 1];
22273     HV *msgs;
22274     SV **msg;
22275
22276     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
22277                              &msgs);
22278
22279     msg = hv_fetchs(msgs, "text", 0);
22280     assert(msg);
22281
22282     (void) sv_2mortal((SV *) msgs);
22283
22284     return SvPVX(*msg);
22285 }
22286
22287 #endif
22288
22289 SV *
22290 Perl_handle_user_defined_property(pTHX_
22291
22292     /* Parses the contents of a user-defined property definition; returning the
22293      * expanded definition if possible.  If so, the return is an inversion
22294      * list.
22295      *
22296      * If there are subroutines that are part of the expansion and which aren't
22297      * known at the time of the call to this function, this returns what
22298      * parse_uniprop_string() returned for the first one encountered.
22299      *
22300      * If an error was found, NULL is returned, and 'msg' gets a suitable
22301      * message appended to it.  (Appending allows the back trace of how we got
22302      * to the faulty definition to be displayed through nested calls of
22303      * user-defined subs.)
22304      *
22305      * The caller IS responsible for freeing any returned SV.
22306      *
22307      * The syntax of the contents is pretty much described in perlunicode.pod,
22308      * but we also allow comments on each line */
22309
22310     const char * name,          /* Name of property */
22311     const STRLEN name_len,      /* The name's length in bytes */
22312     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22313     const bool to_fold,         /* ? Is this under /i */
22314     const bool runtime,         /* ? Are we in compile- or run-time */
22315     const bool deferrable,      /* Is it ok for this property's full definition
22316                                    to be deferred until later? */
22317     SV* contents,               /* The property's definition */
22318     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
22319                                    getting called unless this is thought to be
22320                                    a user-defined property */
22321     SV * msg,                   /* Any error or warning msg(s) are appended to
22322                                    this */
22323     const STRLEN level)         /* Recursion level of this call */
22324 {
22325     STRLEN len;
22326     const char * string         = SvPV_const(contents, len);
22327     const char * const e        = string + len;
22328     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22329     const STRLEN msgs_length_on_entry = SvCUR(msg);
22330
22331     const char * s0 = string;   /* Points to first byte in the current line
22332                                    being parsed in 'string' */
22333     const char overflow_msg[] = "Code point too large in \"";
22334     SV* running_definition = NULL;
22335
22336     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22337
22338     *user_defined_ptr = TRUE;
22339
22340     /* Look at each line */
22341     while (s0 < e) {
22342         const char * s;     /* Current byte */
22343         char op = '+';      /* Default operation is 'union' */
22344         IV   min = 0;       /* range begin code point */
22345         IV   max = -1;      /* and range end */
22346         SV* this_definition;
22347
22348         /* Skip comment lines */
22349         if (*s0 == '#') {
22350             s0 = strchr(s0, '\n');
22351             if (s0 == NULL) {
22352                 break;
22353             }
22354             s0++;
22355             continue;
22356         }
22357
22358         /* For backcompat, allow an empty first line */
22359         if (*s0 == '\n') {
22360             s0++;
22361             continue;
22362         }
22363
22364         /* First character in the line may optionally be the operation */
22365         if (   *s0 == '+'
22366             || *s0 == '!'
22367             || *s0 == '-'
22368             || *s0 == '&')
22369         {
22370             op = *s0++;
22371         }
22372
22373         /* If the line is one or two hex digits separated by blank space, its
22374          * a range; otherwise it is either another user-defined property or an
22375          * error */
22376
22377         s = s0;
22378
22379         if (! isXDIGIT(*s)) {
22380             goto check_if_property;
22381         }
22382
22383         do { /* Each new hex digit will add 4 bits. */
22384             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22385                 s = strchr(s, '\n');
22386                 if (s == NULL) {
22387                     s = e;
22388                 }
22389                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22390                 sv_catpv(msg, overflow_msg);
22391                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22392                                      UTF8fARG(is_contents_utf8, s - s0, s0));
22393                 sv_catpvs(msg, "\"");
22394                 goto return_failure;
22395             }
22396
22397             /* Accumulate this digit into the value */
22398             min = (min << 4) + READ_XDIGIT(s);
22399         } while (isXDIGIT(*s));
22400
22401         while (isBLANK(*s)) { s++; }
22402
22403         /* We allow comments at the end of the line */
22404         if (*s == '#') {
22405             s = strchr(s, '\n');
22406             if (s == NULL) {
22407                 s = e;
22408             }
22409             s++;
22410         }
22411         else if (s < e && *s != '\n') {
22412             if (! isXDIGIT(*s)) {
22413                 goto check_if_property;
22414             }
22415
22416             /* Look for the high point of the range */
22417             max = 0;
22418             do {
22419                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22420                     s = strchr(s, '\n');
22421                     if (s == NULL) {
22422                         s = e;
22423                     }
22424                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22425                     sv_catpv(msg, overflow_msg);
22426                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22427                                       UTF8fARG(is_contents_utf8, s - s0, s0));
22428                     sv_catpvs(msg, "\"");
22429                     goto return_failure;
22430                 }
22431
22432                 max = (max << 4) + READ_XDIGIT(s);
22433             } while (isXDIGIT(*s));
22434
22435             while (isBLANK(*s)) { s++; }
22436
22437             if (*s == '#') {
22438                 s = strchr(s, '\n');
22439                 if (s == NULL) {
22440                     s = e;
22441                 }
22442             }
22443             else if (s < e && *s != '\n') {
22444                 goto check_if_property;
22445             }
22446         }
22447
22448         if (max == -1) {    /* The line only had one entry */
22449             max = min;
22450         }
22451         else if (max < min) {
22452             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22453             sv_catpvs(msg, "Illegal range in \"");
22454             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22455                                 UTF8fARG(is_contents_utf8, s - s0, s0));
22456             sv_catpvs(msg, "\"");
22457             goto return_failure;
22458         }
22459
22460 #if 0   /* See explanation at definition above of get_extended_utf8_msg() */
22461
22462         if (   UNICODE_IS_PERL_EXTENDED(min)
22463             || UNICODE_IS_PERL_EXTENDED(max))
22464         {
22465             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22466
22467             /* If both code points are non-portable, warn only on the lower
22468              * one. */
22469             sv_catpv(msg, get_extended_utf8_msg(
22470                                             (UNICODE_IS_PERL_EXTENDED(min))
22471                                             ? min : max));
22472             sv_catpvs(msg, " in \"");
22473             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22474                                  UTF8fARG(is_contents_utf8, s - s0, s0));
22475             sv_catpvs(msg, "\"");
22476         }
22477
22478 #endif
22479
22480         /* Here, this line contains a legal range */
22481         this_definition = sv_2mortal(_new_invlist(2));
22482         this_definition = _add_range_to_invlist(this_definition, min, max);
22483         goto calculate;
22484
22485       check_if_property:
22486
22487         /* Here it isn't a legal range line.  See if it is a legal property
22488          * line.  First find the end of the meat of the line */
22489         s = strpbrk(s, "#\n");
22490         if (s == NULL) {
22491             s = e;
22492         }
22493
22494         /* Ignore trailing blanks in keeping with the requirements of
22495          * parse_uniprop_string() */
22496         s--;
22497         while (s > s0 && isBLANK_A(*s)) {
22498             s--;
22499         }
22500         s++;
22501
22502         this_definition = parse_uniprop_string(s0, s - s0,
22503                                                is_utf8, to_fold, runtime,
22504                                                deferrable,
22505                                                user_defined_ptr, msg,
22506                                                (name_len == 0)
22507                                                 ? level /* Don't increase level
22508                                                            if input is empty */
22509                                                 : level + 1
22510                                               );
22511         if (this_definition == NULL) {
22512             goto return_failure;    /* 'msg' should have had the reason
22513                                        appended to it by the above call */
22514         }
22515
22516         if (! is_invlist(this_definition)) {    /* Unknown at this time */
22517             return newSVsv(this_definition);
22518         }
22519
22520         if (*s != '\n') {
22521             s = strchr(s, '\n');
22522             if (s == NULL) {
22523                 s = e;
22524             }
22525         }
22526
22527       calculate:
22528
22529         switch (op) {
22530             case '+':
22531                 _invlist_union(running_definition, this_definition,
22532                                                         &running_definition);
22533                 break;
22534             case '-':
22535                 _invlist_subtract(running_definition, this_definition,
22536                                                         &running_definition);
22537                 break;
22538             case '&':
22539                 _invlist_intersection(running_definition, this_definition,
22540                                                         &running_definition);
22541                 break;
22542             case '!':
22543                 _invlist_union_complement_2nd(running_definition,
22544                                         this_definition, &running_definition);
22545                 break;
22546             default:
22547                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22548                                  __FILE__, __LINE__, op);
22549                 break;
22550         }
22551
22552         /* Position past the '\n' */
22553         s0 = s + 1;
22554     }   /* End of loop through the lines of 'contents' */
22555
22556     /* Here, we processed all the lines in 'contents' without error.  If we
22557      * didn't add any warnings, simply return success */
22558     if (msgs_length_on_entry == SvCUR(msg)) {
22559
22560         /* If the expansion was empty, the answer isn't nothing: its an empty
22561          * inversion list */
22562         if (running_definition == NULL) {
22563             running_definition = _new_invlist(1);
22564         }
22565
22566         return running_definition;
22567     }
22568
22569     /* Otherwise, add some explanatory text, but we will return success */
22570     goto return_msg;
22571
22572   return_failure:
22573     running_definition = NULL;
22574
22575   return_msg:
22576
22577     if (name_len > 0) {
22578         sv_catpvs(msg, " in expansion of ");
22579         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
22580     }
22581
22582     return running_definition;
22583 }
22584
22585 /* As explained below, certain operations need to take place in the first
22586  * thread created.  These macros switch contexts */
22587 #ifdef USE_ITHREADS
22588 #  define DECLARATION_FOR_GLOBAL_CONTEXT                                    \
22589                                         PerlInterpreter * save_aTHX = aTHX;
22590 #  define SWITCH_TO_GLOBAL_CONTEXT                                          \
22591                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
22592 #  define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
22593 #  define CUR_CONTEXT      aTHX
22594 #  define ORIGINAL_CONTEXT save_aTHX
22595 #else
22596 #  define DECLARATION_FOR_GLOBAL_CONTEXT
22597 #  define SWITCH_TO_GLOBAL_CONTEXT          NOOP
22598 #  define RESTORE_CONTEXT                   NOOP
22599 #  define CUR_CONTEXT                       NULL
22600 #  define ORIGINAL_CONTEXT                  NULL
22601 #endif
22602
22603 STATIC void
22604 S_delete_recursion_entry(pTHX_ void *key)
22605 {
22606     /* Deletes the entry used to detect recursion when expanding user-defined
22607      * properties.  This is a function so it can be set up to be called even if
22608      * the program unexpectedly quits */
22609
22610     dVAR;
22611     SV ** current_entry;
22612     const STRLEN key_len = strlen((const char *) key);
22613     DECLARATION_FOR_GLOBAL_CONTEXT;
22614
22615     SWITCH_TO_GLOBAL_CONTEXT;
22616
22617     /* If the entry is one of these types, it is a permanent entry, and not the
22618      * one used to detect recursions.  This function should delete only the
22619      * recursion entry */
22620     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
22621     if (     current_entry
22622         && ! is_invlist(*current_entry)
22623         && ! SvPOK(*current_entry))
22624     {
22625         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
22626                                                                     G_DISCARD);
22627     }
22628
22629     RESTORE_CONTEXT;
22630 }
22631
22632 STATIC SV *
22633 S_get_fq_name(pTHX_
22634               const char * const name,    /* The first non-blank in the \p{}, \P{} */
22635               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
22636               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22637               const bool has_colon_colon
22638              )
22639 {
22640     /* Returns a mortal SV containing the fully qualified version of the input
22641      * name */
22642
22643     SV * fq_name;
22644
22645     fq_name = newSVpvs_flags("", SVs_TEMP);
22646
22647     /* Use the current package if it wasn't included in our input */
22648     if (! has_colon_colon) {
22649         const HV * pkg = (IN_PERL_COMPILETIME)
22650                          ? PL_curstash
22651                          : CopSTASH(PL_curcop);
22652         const char* pkgname = HvNAME(pkg);
22653
22654         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22655                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
22656         sv_catpvs(fq_name, "::");
22657     }
22658
22659     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22660                          UTF8fARG(is_utf8, name_len, name));
22661     return fq_name;
22662 }
22663
22664 SV *
22665 Perl_parse_uniprop_string(pTHX_
22666
22667     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
22668      * now.  If so, the return is an inversion list.
22669      *
22670      * If the property is user-defined, it is a subroutine, which in turn
22671      * may call other subroutines.  This function will call the whole nest of
22672      * them to get the definition they return; if some aren't known at the time
22673      * of the call to this function, the fully qualified name of the highest
22674      * level sub is returned.  It is an error to call this function at runtime
22675      * without every sub defined.
22676      *
22677      * If an error was found, NULL is returned, and 'msg' gets a suitable
22678      * message appended to it.  (Appending allows the back trace of how we got
22679      * to the faulty definition to be displayed through nested calls of
22680      * user-defined subs.)
22681      *
22682      * The caller should NOT try to free any returned inversion list.
22683      *
22684      * Other parameters will be set on return as described below */
22685
22686     const char * const name,    /* The first non-blank in the \p{}, \P{} */
22687     const Size_t name_len,      /* Its length in bytes, not including any
22688                                    trailing space */
22689     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22690     const bool to_fold,         /* ? Is this under /i */
22691     const bool runtime,         /* TRUE if this is being called at run time */
22692     const bool deferrable,      /* TRUE if it's ok for the definition to not be
22693                                    known at this call */
22694     bool *user_defined_ptr,     /* Upon return from this function it will be
22695                                    set to TRUE if any component is a
22696                                    user-defined property */
22697     SV * msg,                   /* Any error or warning msg(s) are appended to
22698                                    this */
22699    const STRLEN level)          /* Recursion level of this call */
22700 {
22701     dVAR;
22702     char* lookup_name;          /* normalized name for lookup in our tables */
22703     unsigned lookup_len;        /* Its length */
22704     bool stricter = FALSE;      /* Some properties have stricter name
22705                                    normalization rules, which we decide upon
22706                                    based on parsing */
22707
22708     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
22709      * (though it requires extra effort to download them from Unicode and
22710      * compile perl to know about them) */
22711     bool is_nv_type = FALSE;
22712
22713     unsigned int i, j = 0;
22714     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
22715     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
22716     int table_index = 0;    /* The entry number for this property in the table
22717                                of all Unicode property names */
22718     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
22719     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
22720                                    the normalized name in certain situations */
22721     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
22722                                    part of a package name */
22723     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
22724                                              property rather than a Unicode
22725                                              one. */
22726     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
22727                                      if an error.  If it is an inversion list,
22728                                      it is the definition.  Otherwise it is a
22729                                      string containing the fully qualified sub
22730                                      name of 'name' */
22731     SV * fq_name = NULL;        /* For user-defined properties, the fully
22732                                    qualified name */
22733     bool invert_return = FALSE; /* ? Do we need to complement the result before
22734                                      returning it */
22735
22736     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
22737
22738     /* The input will be normalized into 'lookup_name' */
22739     Newx(lookup_name, name_len, char);
22740     SAVEFREEPV(lookup_name);
22741
22742     /* Parse the input. */
22743     for (i = 0; i < name_len; i++) {
22744         char cur = name[i];
22745
22746         /* Most of the characters in the input will be of this ilk, being parts
22747          * of a name */
22748         if (isIDCONT_A(cur)) {
22749
22750             /* Case differences are ignored.  Our lookup routine assumes
22751              * everything is lowercase, so normalize to that */
22752             if (isUPPER_A(cur)) {
22753                 lookup_name[j++] = toLOWER_A(cur);
22754                 continue;
22755             }
22756
22757             if (cur == '_') { /* Don't include these in the normalized name */
22758                 continue;
22759             }
22760
22761             lookup_name[j++] = cur;
22762
22763             /* The first character in a user-defined name must be of this type.
22764              * */
22765             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
22766                 could_be_user_defined = FALSE;
22767             }
22768
22769             continue;
22770         }
22771
22772         /* Here, the character is not something typically in a name,  But these
22773          * two types of characters (and the '_' above) can be freely ignored in
22774          * most situations.  Later it may turn out we shouldn't have ignored
22775          * them, and we have to reparse, but we don't have enough information
22776          * yet to make that decision */
22777         if (cur == '-' || isSPACE_A(cur)) {
22778             could_be_user_defined = FALSE;
22779             continue;
22780         }
22781
22782         /* An equals sign or single colon mark the end of the first part of
22783          * the property name */
22784         if (    cur == '='
22785             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
22786         {
22787             lookup_name[j++] = '='; /* Treat the colon as an '=' */
22788             equals_pos = j; /* Note where it occurred in the input */
22789             could_be_user_defined = FALSE;
22790             break;
22791         }
22792
22793         /* Otherwise, this character is part of the name. */
22794         lookup_name[j++] = cur;
22795
22796         /* Here it isn't a single colon, so if it is a colon, it must be a
22797          * double colon */
22798         if (cur == ':') {
22799
22800             /* A double colon should be a package qualifier.  We note its
22801              * position and continue.  Note that one could have
22802              *      pkg1::pkg2::...::foo
22803              * so that the position at the end of the loop will be just after
22804              * the final qualifier */
22805
22806             i++;
22807             non_pkg_begin = i + 1;
22808             lookup_name[j++] = ':';
22809         }
22810         else { /* Only word chars (and '::') can be in a user-defined name */
22811             could_be_user_defined = FALSE;
22812         }
22813     } /* End of parsing through the lhs of the property name (or all of it if
22814          no rhs) */
22815
22816 #define STRLENs(s)  (sizeof("" s "") - 1)
22817
22818     /* If there is a single package name 'utf8::', it is ambiguous.  It could
22819      * be for a user-defined property, or it could be a Unicode property, as
22820      * all of them are considered to be for that package.  For the purposes of
22821      * parsing the rest of the property, strip it off */
22822     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
22823         lookup_name +=  STRLENs("utf8::");
22824         j -=  STRLENs("utf8::");
22825         equals_pos -=  STRLENs("utf8::");
22826     }
22827
22828     /* Here, we are either done with the whole property name, if it was simple;
22829      * or are positioned just after the '=' if it is compound. */
22830
22831     if (equals_pos >= 0) {
22832         assert(! stricter); /* We shouldn't have set this yet */
22833
22834         /* Space immediately after the '=' is ignored */
22835         i++;
22836         for (; i < name_len; i++) {
22837             if (! isSPACE_A(name[i])) {
22838                 break;
22839             }
22840         }
22841
22842         /* Most punctuation after the equals indicates a subpattern, like
22843          * \p{foo=/bar/} */
22844         if (   isPUNCT_A(name[i])
22845             && name[i] != '-'
22846             && name[i] != '+'
22847             && name[i] != '_'
22848             && name[i] != '{')
22849         {
22850             /* Find the property.  The table includes the equals sign, so we
22851              * use 'j' as-is */
22852             table_index = match_uniprop((U8 *) lookup_name, j);
22853             if (table_index) {
22854                 const char * const * prop_values
22855                                             = UNI_prop_value_ptrs[table_index];
22856                 SV * subpattern;
22857                 Size_t subpattern_len;
22858                 REGEXP * subpattern_re;
22859                 char open = name[i++];
22860                 char close;
22861                 const char * pos_in_brackets;
22862                 bool escaped = 0;
22863
22864                 /* A backslash means the real delimitter is the next character.
22865                  * */
22866                 if (open == '\\') {
22867                     open = name[i++];
22868                     escaped = 1;
22869                 }
22870
22871                 /* This data structure is constructed so that the matching
22872                  * closing bracket is 3 past its matching opening.  The second
22873                  * set of closing is so that if the opening is something like
22874                  * ']', the closing will be that as well.  Something similar is
22875                  * done in toke.c */
22876                 pos_in_brackets = strchr("([<)]>)]>", open);
22877                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
22878
22879                 if (    i >= name_len
22880                     ||  name[name_len-1] != close
22881                     || (escaped && name[name_len-2] != '\\'))
22882                 {
22883                     sv_catpvs(msg, "Unicode property wildcard not terminated");
22884                     goto append_name_to_msg;
22885                 }
22886
22887                 Perl_ck_warner_d(aTHX_
22888                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
22889                     "The Unicode property wildcards feature is experimental");
22890
22891                 /* Now create and compile the wildcard subpattern.  Use /iaa
22892                  * because nothing outside of ASCII will match, and it the
22893                  * property values should all match /i.  Note that when the
22894                  * pattern fails to compile, our added text to the user's
22895                  * pattern will be displayed to the user, which is not so
22896                  * desirable. */
22897                 subpattern_len = name_len - i - 1 - escaped;
22898                 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
22899                                               (unsigned) subpattern_len,
22900                                               name + i);
22901                 subpattern = sv_2mortal(subpattern);
22902                 subpattern_re = re_compile(subpattern, 0);
22903                 assert(subpattern_re);  /* Should have died if didn't compile
22904                                          successfully */
22905
22906                 /* For each legal property value, see if the supplied pattern
22907                  * matches it. */
22908                 while (*prop_values) {
22909                     const char * const entry = *prop_values;
22910                     const Size_t len = strlen(entry);
22911                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
22912
22913                     if (pregexec(subpattern_re,
22914                                  (char *) entry,
22915                                  (char *) entry + len,
22916                                  (char *) entry, 0,
22917                                  entry_sv,
22918                                  0))
22919                     { /* Here, matched.  Add to the returned list */
22920                         Size_t total_len = j + len;
22921                         SV * sub_invlist = NULL;
22922                         char * this_string;
22923
22924                         /* We know this is a legal \p{property=value}.  Call
22925                          * the function to return the list of code points that
22926                          * match it */
22927                         Newxz(this_string, total_len + 1, char);
22928                         Copy(lookup_name, this_string, j, char);
22929                         my_strlcat(this_string, entry, total_len + 1);
22930                         SAVEFREEPV(this_string);
22931                         sub_invlist = parse_uniprop_string(this_string,
22932                                                            total_len,
22933                                                            is_utf8,
22934                                                            to_fold,
22935                                                            runtime,
22936                                                            deferrable,
22937                                                            user_defined_ptr,
22938                                                            msg,
22939                                                            level + 1);
22940                         _invlist_union(prop_definition, sub_invlist,
22941                                        &prop_definition);
22942                     }
22943
22944                     prop_values++;  /* Next iteration, look at next propvalue */
22945                 } /* End of looking through property values; (the data
22946                      structure is terminated by a NULL ptr) */
22947
22948                 SvREFCNT_dec_NN(subpattern_re);
22949
22950                 if (prop_definition) {
22951                     return prop_definition;
22952                 }
22953
22954                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
22955                 goto append_name_to_msg;
22956             }
22957
22958             /* Here's how khw thinks we should proceed to handle the properties
22959              * not yet done:    Bidi Mirroring Glyph
22960                                 Bidi Paired Bracket
22961                                 Case Folding  (both full and simple)
22962                                 Decomposition Mapping
22963                                 Equivalent Unified Ideograph
22964                                 Name
22965                                 Name Alias
22966                                 Lowercase Mapping  (both full and simple)
22967                                 NFKC Case Fold
22968                                 Titlecase Mapping  (both full and simple)
22969                                 Uppercase Mapping  (both full and simple)
22970              * Move the part that looks at the property values into a perl
22971              * script, like utf8_heavy.pl is done.  This makes things somewhat
22972              * easier, but most importantly, it avoids always adding all these
22973              * strings to the memory usage when the feature is little-used.
22974              *
22975              * The property values would all be concatenated into a single
22976              * string per property with each value on a separate line, and the
22977              * code point it's for on alternating lines.  Then we match the
22978              * user's input pattern m//mg, without having to worry about their
22979              * uses of '^' and '$'.  Only the values that aren't the default
22980              * would be in the strings.  Code points would be in UTF-8.  The
22981              * search pattern that we would construct would look like
22982              * (?: \n (code-point_re) \n (?aam: user-re ) \n )
22983              * And so $1 would contain the code point that matched the user-re.
22984              * For properties where the default is the code point itself, such
22985              * as any of the case changing mappings, the string would otherwise
22986              * consist of all Unicode code points in UTF-8 strung together.
22987              * This would be impractical.  So instead, examine their compiled
22988              * pattern, looking at the ssc.  If none, reject the pattern as an
22989              * error.  Otherwise run the pattern against every code point in
22990              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
22991              * And it might be good to create an API to return the ssc.
22992              *
22993              * For the name properties, a new function could be created in
22994              * charnames which essentially does the same thing as above,
22995              * sharing Name.pl with the other charname functions.  Don't know
22996              * about loose name matching, or algorithmically determined names.
22997              * Decomposition.pl similarly.
22998              *
22999              * It might be that a new pattern modifier would have to be
23000              * created, like /t for resTricTed, which changed the behavior of
23001              * some constructs in their subpattern, like \A. */
23002         } /* End of is a wildcard subppattern */
23003
23004
23005         /* Certain properties whose values are numeric need special handling.
23006          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
23007          * purposes of checking if this is one of those properties */
23008         if (memBEGINPs(lookup_name, j, "is")) {
23009             lookup_offset = 2;
23010         }
23011
23012         /* Then check if it is one of these specially-handled properties.  The
23013          * possibilities are hard-coded because easier this way, and the list
23014          * is unlikely to change.
23015          *
23016          * All numeric value type properties are of this ilk, and are also
23017          * special in a different way later on.  So find those first.  There
23018          * are several numeric value type properties in the Unihan DB (which is
23019          * unlikely to be compiled with perl, but we handle it here in case it
23020          * does get compiled).  They all end with 'numeric'.  The interiors
23021          * aren't checked for the precise property.  This would stop working if
23022          * a cjk property were to be created that ended with 'numeric' and
23023          * wasn't a numeric type */
23024         is_nv_type = memEQs(lookup_name + lookup_offset,
23025                        j - 1 - lookup_offset, "numericvalue")
23026                   || memEQs(lookup_name + lookup_offset,
23027                       j - 1 - lookup_offset, "nv")
23028                   || (   memENDPs(lookup_name + lookup_offset,
23029                             j - 1 - lookup_offset, "numeric")
23030                       && (   memBEGINPs(lookup_name + lookup_offset,
23031                                       j - 1 - lookup_offset, "cjk")
23032                           || memBEGINPs(lookup_name + lookup_offset,
23033                                       j - 1 - lookup_offset, "k")));
23034         if (   is_nv_type
23035             || memEQs(lookup_name + lookup_offset,
23036                       j - 1 - lookup_offset, "canonicalcombiningclass")
23037             || memEQs(lookup_name + lookup_offset,
23038                       j - 1 - lookup_offset, "ccc")
23039             || memEQs(lookup_name + lookup_offset,
23040                       j - 1 - lookup_offset, "age")
23041             || memEQs(lookup_name + lookup_offset,
23042                       j - 1 - lookup_offset, "in")
23043             || memEQs(lookup_name + lookup_offset,
23044                       j - 1 - lookup_offset, "presentin"))
23045         {
23046             unsigned int k;
23047
23048             /* Since the stuff after the '=' is a number, we can't throw away
23049              * '-' willy-nilly, as those could be a minus sign.  Other stricter
23050              * rules also apply.  However, these properties all can have the
23051              * rhs not be a number, in which case they contain at least one
23052              * alphabetic.  In those cases, the stricter rules don't apply.
23053              * But the numeric type properties can have the alphas [Ee] to
23054              * signify an exponent, and it is still a number with stricter
23055              * rules.  So look for an alpha that signifies not-strict */
23056             stricter = TRUE;
23057             for (k = i; k < name_len; k++) {
23058                 if (   isALPHA_A(name[k])
23059                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
23060                 {
23061                     stricter = FALSE;
23062                     break;
23063                 }
23064             }
23065         }
23066
23067         if (stricter) {
23068
23069             /* A number may have a leading '+' or '-'.  The latter is retained
23070              * */
23071             if (name[i] == '+') {
23072                 i++;
23073             }
23074             else if (name[i] == '-') {
23075                 lookup_name[j++] = '-';
23076                 i++;
23077             }
23078
23079             /* Skip leading zeros including single underscores separating the
23080              * zeros, or between the final leading zero and the first other
23081              * digit */
23082             for (; i < name_len - 1; i++) {
23083                 if (    name[i] != '0'
23084                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23085                 {
23086                     break;
23087                 }
23088             }
23089         }
23090     }
23091     else {  /* No '=' */
23092
23093        /* Only a few properties without an '=' should be parsed with stricter
23094         * rules.  The list is unlikely to change. */
23095         if (   memBEGINPs(lookup_name, j, "perl")
23096             && memNEs(lookup_name + 4, j - 4, "space")
23097             && memNEs(lookup_name + 4, j - 4, "word"))
23098         {
23099             stricter = TRUE;
23100
23101             /* We set the inputs back to 0 and the code below will reparse,
23102              * using strict */
23103             i = j = 0;
23104         }
23105     }
23106
23107     /* Here, we have either finished the property, or are positioned to parse
23108      * the remainder, and we know if stricter rules apply.  Finish out, if not
23109      * already done */
23110     for (; i < name_len; i++) {
23111         char cur = name[i];
23112
23113         /* In all instances, case differences are ignored, and we normalize to
23114          * lowercase */
23115         if (isUPPER_A(cur)) {
23116             lookup_name[j++] = toLOWER(cur);
23117             continue;
23118         }
23119
23120         /* An underscore is skipped, but not under strict rules unless it
23121          * separates two digits */
23122         if (cur == '_') {
23123             if (    stricter
23124                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
23125                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23126             {
23127                 lookup_name[j++] = '_';
23128             }
23129             continue;
23130         }
23131
23132         /* Hyphens are skipped except under strict */
23133         if (cur == '-' && ! stricter) {
23134             continue;
23135         }
23136
23137         /* XXX Bug in documentation.  It says white space skipped adjacent to
23138          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
23139          * in a number */
23140         if (isSPACE_A(cur) && ! stricter) {
23141             continue;
23142         }
23143
23144         lookup_name[j++] = cur;
23145
23146         /* Unless this is a non-trailing slash, we are done with it */
23147         if (i >= name_len - 1 || cur != '/') {
23148             continue;
23149         }
23150
23151         slash_pos = j;
23152
23153         /* A slash in the 'numeric value' property indicates that what follows
23154          * is a denominator.  It can have a leading '+' and '0's that should be
23155          * skipped.  But we have never allowed a negative denominator, so treat
23156          * a minus like every other character.  (No need to rule out a second
23157          * '/', as that won't match anything anyway */
23158         if (is_nv_type) {
23159             i++;
23160             if (i < name_len && name[i] == '+') {
23161                 i++;
23162             }
23163
23164             /* Skip leading zeros including underscores separating digits */
23165             for (; i < name_len - 1; i++) {
23166                 if (   name[i] != '0'
23167                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23168                 {
23169                     break;
23170                 }
23171             }
23172
23173             /* Store the first real character in the denominator */
23174             if (i < name_len) {
23175                 lookup_name[j++] = name[i];
23176             }
23177         }
23178     }
23179
23180     /* Here are completely done parsing the input 'name', and 'lookup_name'
23181      * contains a copy, normalized.
23182      *
23183      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23184      * different from without the underscores.  */
23185     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
23186            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23187         && UNLIKELY(name[name_len-1] == '_'))
23188     {
23189         lookup_name[j++] = '&';
23190     }
23191
23192     /* If the original input began with 'In' or 'Is', it could be a subroutine
23193      * call to a user-defined property instead of a Unicode property name. */
23194     if (    name_len - non_pkg_begin > 2
23195         &&  name[non_pkg_begin+0] == 'I'
23196         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23197     {
23198         /* Names that start with In have different characterstics than those
23199          * that start with Is */
23200         if (name[non_pkg_begin+1] == 's') {
23201             starts_with_Is = TRUE;
23202         }
23203     }
23204     else {
23205         could_be_user_defined = FALSE;
23206     }
23207
23208     if (could_be_user_defined) {
23209         CV* user_sub;
23210
23211         /* If the user defined property returns the empty string, it could
23212          * easily be because the pattern is being compiled before the data it
23213          * actually needs to compile is available.  This could be argued to be
23214          * a bug in the perl code, but this is a change of behavior for Perl,
23215          * so we handle it.  This means that intentionally returning nothing
23216          * will not be resolved until runtime */
23217         bool empty_return = FALSE;
23218
23219         /* Here, the name could be for a user defined property, which are
23220          * implemented as subs. */
23221         user_sub = get_cvn_flags(name, name_len, 0);
23222         if (user_sub) {
23223             const char insecure[] = "Insecure user-defined property";
23224
23225             /* Here, there is a sub by the correct name.  Normally we call it
23226              * to get the property definition */
23227             dSP;
23228             SV * user_sub_sv = MUTABLE_SV(user_sub);
23229             SV * error;     /* Any error returned by calling 'user_sub' */
23230             SV * key;       /* The key into the hash of user defined sub names
23231                              */
23232             SV * placeholder;
23233             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
23234
23235             /* How many times to retry when another thread is in the middle of
23236              * expanding the same definition we want */
23237             PERL_INT_FAST8_T retry_countdown = 10;
23238
23239             DECLARATION_FOR_GLOBAL_CONTEXT;
23240
23241             /* If we get here, we know this property is user-defined */
23242             *user_defined_ptr = TRUE;
23243
23244             /* We refuse to call a potentially tainted subroutine; returning an
23245              * error instead */
23246             if (TAINT_get) {
23247                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23248                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23249                 goto append_name_to_msg;
23250             }
23251
23252             /* In principal, we only call each subroutine property definition
23253              * once during the life of the program.  This guarantees that the
23254              * property definition never changes.  The results of the single
23255              * sub call are stored in a hash, which is used instead for future
23256              * references to this property.  The property definition is thus
23257              * immutable.  But, to allow the user to have a /i-dependent
23258              * definition, we call the sub once for non-/i, and once for /i,
23259              * should the need arise, passing the /i status as a parameter.
23260              *
23261              * We start by constructing the hash key name, consisting of the
23262              * fully qualified subroutine name, preceded by the /i status, so
23263              * that there is a key for /i and a different key for non-/i */
23264             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23265             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23266                                           non_pkg_begin != 0);
23267             sv_catsv(key, fq_name);
23268             sv_2mortal(key);
23269
23270             /* We only call the sub once throughout the life of the program
23271              * (with the /i, non-/i exception noted above).  That means the
23272              * hash must be global and accessible to all threads.  It is
23273              * created at program start-up, before any threads are created, so
23274              * is accessible to all children.  But this creates some
23275              * complications.
23276              *
23277              * 1) The keys can't be shared, or else problems arise; sharing is
23278              *    turned off at hash creation time
23279              * 2) All SVs in it are there for the remainder of the life of the
23280              *    program, and must be created in the same interpreter context
23281              *    as the hash, or else they will be freed from the wrong pool
23282              *    at global destruction time.  This is handled by switching to
23283              *    the hash's context to create each SV going into it, and then
23284              *    immediately switching back
23285              * 3) All accesses to the hash must be controlled by a mutex, to
23286              *    prevent two threads from getting an unstable state should
23287              *    they simultaneously be accessing it.  The code below is
23288              *    crafted so that the mutex is locked whenever there is an
23289              *    access and unlocked only when the next stable state is
23290              *    achieved.
23291              *
23292              * The hash stores either the definition of the property if it was
23293              * valid, or, if invalid, the error message that was raised.  We
23294              * use the type of SV to distinguish.
23295              *
23296              * There's also the need to guard against the definition expansion
23297              * from infinitely recursing.  This is handled by storing the aTHX
23298              * of the expanding thread during the expansion.  Again the SV type
23299              * is used to distinguish this from the other two cases.  If we
23300              * come to here and the hash entry for this property is our aTHX,
23301              * it means we have recursed, and the code assumes that we would
23302              * infinitely recurse, so instead stops and raises an error.
23303              * (Any recursion has always been treated as infinite recursion in
23304              * this feature.)
23305              *
23306              * If instead, the entry is for a different aTHX, it means that
23307              * that thread has gotten here first, and hasn't finished expanding
23308              * the definition yet.  We just have to wait until it is done.  We
23309              * sleep and retry a few times, returning an error if the other
23310              * thread doesn't complete. */
23311
23312           re_fetch:
23313             USER_PROP_MUTEX_LOCK;
23314
23315             /* If we have an entry for this key, the subroutine has already
23316              * been called once with this /i status. */
23317             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23318                                                    SvPVX(key), SvCUR(key), 0);
23319             if (saved_user_prop_ptr) {
23320
23321                 /* If the saved result is an inversion list, it is the valid
23322                  * definition of this property */
23323                 if (is_invlist(*saved_user_prop_ptr)) {
23324                     prop_definition = *saved_user_prop_ptr;
23325
23326                     /* The SV in the hash won't be removed until global
23327                      * destruction, so it is stable and we can unlock */
23328                     USER_PROP_MUTEX_UNLOCK;
23329
23330                     /* The caller shouldn't try to free this SV */
23331                     return prop_definition;
23332                 }
23333
23334                 /* Otherwise, if it is a string, it is the error message
23335                  * that was returned when we first tried to evaluate this
23336                  * property.  Fail, and append the message */
23337                 if (SvPOK(*saved_user_prop_ptr)) {
23338                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23339                     sv_catsv(msg, *saved_user_prop_ptr);
23340
23341                     /* The SV in the hash won't be removed until global
23342                      * destruction, so it is stable and we can unlock */
23343                     USER_PROP_MUTEX_UNLOCK;
23344
23345                     return NULL;
23346                 }
23347
23348                 assert(SvIOK(*saved_user_prop_ptr));
23349
23350                 /* Here, we have an unstable entry in the hash.  Either another
23351                  * thread is in the middle of expanding the property's
23352                  * definition, or we are ourselves recursing.  We use the aTHX
23353                  * in it to distinguish */
23354                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23355
23356                     /* Here, it's another thread doing the expanding.  We've
23357                      * looked as much as we are going to at the contents of the
23358                      * hash entry.  It's safe to unlock. */
23359                     USER_PROP_MUTEX_UNLOCK;
23360
23361                     /* Retry a few times */
23362                     if (retry_countdown-- > 0) {
23363                         PerlProc_sleep(1);
23364                         goto re_fetch;
23365                     }
23366
23367                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23368                     sv_catpvs(msg, "Timeout waiting for another thread to "
23369                                    "define");
23370                     goto append_name_to_msg;
23371                 }
23372
23373                 /* Here, we are recursing; don't dig any deeper */
23374                 USER_PROP_MUTEX_UNLOCK;
23375
23376                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23377                 sv_catpvs(msg,
23378                           "Infinite recursion in user-defined property");
23379                 goto append_name_to_msg;
23380             }
23381
23382             /* Here, this thread has exclusive control, and there is no entry
23383              * for this property in the hash.  So we have the go ahead to
23384              * expand the definition ourselves. */
23385
23386             PUSHSTACKi(PERLSI_MAGIC);
23387             ENTER;
23388
23389             /* Create a temporary placeholder in the hash to detect recursion
23390              * */
23391             SWITCH_TO_GLOBAL_CONTEXT;
23392             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23393             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23394             RESTORE_CONTEXT;
23395
23396             /* Now that we have a placeholder, we can let other threads
23397              * continue */
23398             USER_PROP_MUTEX_UNLOCK;
23399
23400             /* Make sure the placeholder always gets destroyed */
23401             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23402
23403             PUSHMARK(SP);
23404             SAVETMPS;
23405
23406             /* Call the user's function, with the /i status as a parameter.
23407              * Note that we have gone to a lot of trouble to keep this call
23408              * from being within the locked mutex region. */
23409             XPUSHs(boolSV(to_fold));
23410             PUTBACK;
23411
23412             /* The following block was taken from swash_init().  Presumably
23413              * they apply to here as well, though we no longer use a swash --
23414              * khw */
23415             SAVEHINTS();
23416             save_re_context();
23417             /* We might get here via a subroutine signature which uses a utf8
23418              * parameter name, at which point PL_subname will have been set
23419              * but not yet used. */
23420             save_item(PL_subname);
23421
23422             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23423
23424             SPAGAIN;
23425
23426             error = ERRSV;
23427             if (TAINT_get || SvTRUE(error)) {
23428                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23429                 if (SvTRUE(error)) {
23430                     sv_catpvs(msg, "Error \"");
23431                     sv_catsv(msg, error);
23432                     sv_catpvs(msg, "\"");
23433                 }
23434                 if (TAINT_get) {
23435                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
23436                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23437                 }
23438
23439                 if (name_len > 0) {
23440                     sv_catpvs(msg, " in expansion of ");
23441                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23442                                                                   name_len,
23443                                                                   name));
23444                 }
23445
23446                 (void) POPs;
23447                 prop_definition = NULL;
23448             }
23449             else {  /* G_SCALAR guarantees a single return value */
23450                 SV * contents = POPs;
23451
23452                 /* The contents is supposed to be the expansion of the property
23453                  * definition.  If the definition is deferrable, and we got an
23454                  * empty string back, set a flag to later defer it (after clean
23455                  * up below). */
23456                 if (      deferrable
23457                     && (! SvPOK(contents) || SvCUR(contents) == 0))
23458                 {
23459                         empty_return = TRUE;
23460                 }
23461                 else { /* Otherwise, call a function to check for valid syntax,
23462                           and handle it */
23463
23464                     prop_definition = handle_user_defined_property(
23465                                                     name, name_len,
23466                                                     is_utf8, to_fold, runtime,
23467                                                     deferrable,
23468                                                     contents, user_defined_ptr,
23469                                                     msg,
23470                                                     level);
23471                 }
23472             }
23473
23474             /* Here, we have the results of the expansion.  Delete the
23475              * placeholder, and if the definition is now known, replace it with
23476              * that definition.  We need exclusive access to the hash, and we
23477              * can't let anyone else in, between when we delete the placeholder
23478              * and add the permanent entry */
23479             USER_PROP_MUTEX_LOCK;
23480
23481             S_delete_recursion_entry(aTHX_ SvPVX(key));
23482
23483             if (    ! empty_return
23484                 && (! prop_definition || is_invlist(prop_definition)))
23485             {
23486                 /* If we got success we use the inversion list defining the
23487                  * property; otherwise use the error message */
23488                 SWITCH_TO_GLOBAL_CONTEXT;
23489                 (void) hv_store_ent(PL_user_def_props,
23490                                     key,
23491                                     ((prop_definition)
23492                                      ? newSVsv(prop_definition)
23493                                      : newSVsv(msg)),
23494                                     0);
23495                 RESTORE_CONTEXT;
23496             }
23497
23498             /* All done, and the hash now has a permanent entry for this
23499              * property.  Give up exclusive control */
23500             USER_PROP_MUTEX_UNLOCK;
23501
23502             FREETMPS;
23503             LEAVE;
23504             POPSTACK;
23505
23506             if (empty_return) {
23507                 goto definition_deferred;
23508             }
23509
23510             if (prop_definition) {
23511
23512                 /* If the definition is for something not known at this time,
23513                  * we toss it, and go return the main property name, as that's
23514                  * the one the user will be aware of */
23515                 if (! is_invlist(prop_definition)) {
23516                     SvREFCNT_dec_NN(prop_definition);
23517                     goto definition_deferred;
23518                 }
23519
23520                 sv_2mortal(prop_definition);
23521             }
23522
23523             /* And return */
23524             return prop_definition;
23525
23526         }   /* End of calling the subroutine for the user-defined property */
23527     }       /* End of it could be a user-defined property */
23528
23529     /* Here it wasn't a user-defined property that is known at this time.  See
23530      * if it is a Unicode property */
23531
23532     lookup_len = j;     /* This is a more mnemonic name than 'j' */
23533
23534     /* Get the index into our pointer table of the inversion list corresponding
23535      * to the property */
23536     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23537
23538     /* If it didn't find the property ... */
23539     if (table_index == 0) {
23540
23541         /* Try again stripping off any initial 'Is'.  This is because we
23542          * promise that an initial Is is optional.  The same isn't true of
23543          * names that start with 'In'.  Those can match only blocks, and the
23544          * lookup table already has those accounted for. */
23545         if (starts_with_Is) {
23546             lookup_name += 2;
23547             lookup_len -= 2;
23548             equals_pos -= 2;
23549             slash_pos -= 2;
23550
23551             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23552         }
23553
23554         if (table_index == 0) {
23555             char * canonical;
23556
23557             /* Here, we didn't find it.  If not a numeric type property, and
23558              * can't be a user-defined one, it isn't a legal property */
23559             if (! is_nv_type) {
23560                 if (! could_be_user_defined) {
23561                     goto failed;
23562                 }
23563
23564                 /* Here, the property name is legal as a user-defined one.   At
23565                  * compile time, it might just be that the subroutine for that
23566                  * property hasn't been encountered yet, but at runtime, it's
23567                  * an error to try to use an undefined one */
23568                 if (! deferrable) {
23569                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23570                     sv_catpvs(msg, "Unknown user-defined property name");
23571                     goto append_name_to_msg;
23572                 }
23573
23574                 goto definition_deferred;
23575             } /* End of isn't a numeric type property */
23576
23577             /* The numeric type properties need more work to decide.  What we
23578              * do is make sure we have the number in canonical form and look
23579              * that up. */
23580
23581             if (slash_pos < 0) {    /* No slash */
23582
23583                 /* When it isn't a rational, take the input, convert it to a
23584                  * NV, then create a canonical string representation of that
23585                  * NV. */
23586
23587                 NV value;
23588                 SSize_t value_len = lookup_len - equals_pos;
23589
23590                 /* Get the value */
23591                 if (   value_len <= 0
23592                     || my_atof3(lookup_name + equals_pos, &value,
23593                                 value_len)
23594                           != lookup_name + lookup_len)
23595                 {
23596                     goto failed;
23597                 }
23598
23599                 /* If the value is an integer, the canonical value is integral
23600                  * */
23601                 if (Perl_ceil(value) == value) {
23602                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
23603                                             equals_pos, lookup_name, value);
23604                 }
23605                 else {  /* Otherwise, it is %e with a known precision */
23606                     char * exp_ptr;
23607
23608                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
23609                                                 equals_pos, lookup_name,
23610                                                 PL_E_FORMAT_PRECISION, value);
23611
23612                     /* The exponent generated is expecting two digits, whereas
23613                      * %e on some systems will generate three.  Remove leading
23614                      * zeros in excess of 2 from the exponent.  We start
23615                      * looking for them after the '=' */
23616                     exp_ptr = strchr(canonical + equals_pos, 'e');
23617                     if (exp_ptr) {
23618                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
23619                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
23620
23621                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
23622
23623                         if (excess_exponent_len > 0) {
23624                             SSize_t leading_zeros = strspn(cur_ptr, "0");
23625                             SSize_t excess_leading_zeros
23626                                     = MIN(leading_zeros, excess_exponent_len);
23627                             if (excess_leading_zeros > 0) {
23628                                 Move(cur_ptr + excess_leading_zeros,
23629                                      cur_ptr,
23630                                      strlen(cur_ptr) - excess_leading_zeros
23631                                        + 1,  /* Copy the NUL as well */
23632                                      char);
23633                             }
23634                         }
23635                     }
23636                 }
23637             }
23638             else {  /* Has a slash.  Create a rational in canonical form  */
23639                 UV numerator, denominator, gcd, trial;
23640                 const char * end_ptr;
23641                 const char * sign = "";
23642
23643                 /* We can't just find the numerator, denominator, and do the
23644                  * division, then use the method above, because that is
23645                  * inexact.  And the input could be a rational that is within
23646                  * epsilon (given our precision) of a valid rational, and would
23647                  * then incorrectly compare valid.
23648                  *
23649                  * We're only interested in the part after the '=' */
23650                 const char * this_lookup_name = lookup_name + equals_pos;
23651                 lookup_len -= equals_pos;
23652                 slash_pos -= equals_pos;
23653
23654                 /* Handle any leading minus */
23655                 if (this_lookup_name[0] == '-') {
23656                     sign = "-";
23657                     this_lookup_name++;
23658                     lookup_len--;
23659                     slash_pos--;
23660                 }
23661
23662                 /* Convert the numerator to numeric */
23663                 end_ptr = this_lookup_name + slash_pos;
23664                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
23665                     goto failed;
23666                 }
23667
23668                 /* It better have included all characters before the slash */
23669                 if (*end_ptr != '/') {
23670                     goto failed;
23671                 }
23672
23673                 /* Set to look at just the denominator */
23674                 this_lookup_name += slash_pos;
23675                 lookup_len -= slash_pos;
23676                 end_ptr = this_lookup_name + lookup_len;
23677
23678                 /* Convert the denominator to numeric */
23679                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
23680                     goto failed;
23681                 }
23682
23683                 /* It better be the rest of the characters, and don't divide by
23684                  * 0 */
23685                 if (   end_ptr != this_lookup_name + lookup_len
23686                     || denominator == 0)
23687                 {
23688                     goto failed;
23689                 }
23690
23691                 /* Get the greatest common denominator using
23692                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
23693                 gcd = numerator;
23694                 trial = denominator;
23695                 while (trial != 0) {
23696                     UV temp = trial;
23697                     trial = gcd % trial;
23698                     gcd = temp;
23699                 }
23700
23701                 /* If already in lowest possible terms, we have already tried
23702                  * looking this up */
23703                 if (gcd == 1) {
23704                     goto failed;
23705                 }
23706
23707                 /* Reduce the rational, which should put it in canonical form
23708                  * */
23709                 numerator /= gcd;
23710                 denominator /= gcd;
23711
23712                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
23713                         equals_pos, lookup_name, sign, numerator, denominator);
23714             }
23715
23716             /* Here, we have the number in canonical form.  Try that */
23717             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
23718             if (table_index == 0) {
23719                 goto failed;
23720             }
23721         }   /* End of still didn't find the property in our table */
23722     }       /* End of       didn't find the property in our table */
23723
23724     /* Here, we have a non-zero return, which is an index into a table of ptrs.
23725      * A negative return signifies that the real index is the absolute value,
23726      * but the result needs to be inverted */
23727     if (table_index < 0) {
23728         invert_return = TRUE;
23729         table_index = -table_index;
23730     }
23731
23732     /* Out-of band indices indicate a deprecated property.  The proper index is
23733      * modulo it with the table size.  And dividing by the table size yields
23734      * an offset into a table constructed by regen/mk_invlists.pl to contain
23735      * the corresponding warning message */
23736     if (table_index > MAX_UNI_KEYWORD_INDEX) {
23737         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
23738         table_index %= MAX_UNI_KEYWORD_INDEX;
23739         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
23740                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
23741                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
23742     }
23743
23744     /* In a few properties, a different property is used under /i.  These are
23745      * unlikely to change, so are hard-coded here. */
23746     if (to_fold) {
23747         if (   table_index == UNI_XPOSIXUPPER
23748             || table_index == UNI_XPOSIXLOWER
23749             || table_index == UNI_TITLE)
23750         {
23751             table_index = UNI_CASED;
23752         }
23753         else if (   table_index == UNI_UPPERCASELETTER
23754                  || table_index == UNI_LOWERCASELETTER
23755 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
23756                  || table_index == UNI_TITLECASELETTER
23757 #  endif
23758         ) {
23759             table_index = UNI_CASEDLETTER;
23760         }
23761         else if (  table_index == UNI_POSIXUPPER
23762                 || table_index == UNI_POSIXLOWER)
23763         {
23764             table_index = UNI_POSIXALPHA;
23765         }
23766     }
23767
23768     /* Create and return the inversion list */
23769     prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
23770     sv_2mortal(prop_definition);
23771
23772
23773     /* See if there is a private use override to add to this definition */
23774     {
23775         COPHH * hinthash = (IN_PERL_COMPILETIME)
23776                            ? CopHINTHASH_get(&PL_compiling)
23777                            : CopHINTHASH_get(PL_curcop);
23778         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
23779
23780         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
23781
23782             /* See if there is an element in the hints hash for this table */
23783             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
23784             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
23785
23786             if (pos) {
23787                 bool dummy;
23788                 SV * pu_definition;
23789                 SV * pu_invlist;
23790                 SV * expanded_prop_definition =
23791                             sv_2mortal(invlist_clone(prop_definition, NULL));
23792
23793                 /* If so, it's definition is the string from here to the next
23794                  * \a character.  And its format is the same as a user-defined
23795                  * property */
23796                 pos += SvCUR(pu_lookup);
23797                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
23798                 pu_invlist = handle_user_defined_property(lookup_name,
23799                                                           lookup_len,
23800                                                           0, /* Not UTF-8 */
23801                                                           0, /* Not folded */
23802                                                           runtime,
23803                                                           deferrable,
23804                                                           pu_definition,
23805                                                           &dummy,
23806                                                           msg,
23807                                                           level);
23808                 if (TAINT_get) {
23809                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23810                     sv_catpvs(msg, "Insecure private-use override");
23811                     goto append_name_to_msg;
23812                 }
23813
23814                 /* For now, as a safety measure, make sure that it doesn't
23815                  * override non-private use code points */
23816                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
23817
23818                 /* Add it to the list to be returned */
23819                 _invlist_union(prop_definition, pu_invlist,
23820                                &expanded_prop_definition);
23821                 prop_definition = expanded_prop_definition;
23822                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
23823             }
23824         }
23825     }
23826
23827     if (invert_return) {
23828         _invlist_invert(prop_definition);
23829     }
23830     return prop_definition;
23831
23832
23833   failed:
23834     if (non_pkg_begin != 0) {
23835         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23836         sv_catpvs(msg, "Illegal user-defined property name");
23837     }
23838     else {
23839         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23840         sv_catpvs(msg, "Can't find Unicode property definition");
23841     }
23842     /* FALLTHROUGH */
23843
23844   append_name_to_msg:
23845     {
23846         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
23847         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
23848
23849         sv_catpv(msg, prefix);
23850         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23851         sv_catpv(msg, suffix);
23852     }
23853
23854     return NULL;
23855
23856   definition_deferred:
23857
23858     /* Here it could yet to be defined, so defer evaluation of this
23859      * until its needed at runtime.  We need the fully qualified property name
23860      * to avoid ambiguity, and a trailing newline */
23861     if (! fq_name) {
23862         fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23863                                       non_pkg_begin != 0 /* If has "::" */
23864                                );
23865     }
23866     sv_catpvs(fq_name, "\n");
23867
23868     *user_defined_ptr = TRUE;
23869     return fq_name;
23870 }
23871
23872 #endif
23873
23874 /*
23875  * ex: set ts=8 sts=4 sw=4 et:
23876  */