regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
   20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
   21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #define REG_COMP_C
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 EXTERN_C const struct regexp_engine my_reg_engine;
  81 #else
  82 #  include "regcomp.h"
  83 #endif
  84
  85 #include "dquote_inline.h"
  86 #include "invlist_inline.h"
  87 #include "unicode_constants.h"
  88
   89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
   90  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
   91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
   92  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
   93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
   94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
      /* The four macros above are short local aliases: each one forwards its
       * argument unchanged to a macro whose long name spells out which source
       * files are meant to use it. */
  95
   96 #ifndef STATIC
   97 #define STATIC  static  /* default, used only when STATIC is not already defined */
   98 #endif
  99
  100 /* this is a chain of data about sub patterns we are processing that
  101    need to be handled separately/specially in study_chunk.  It's so
  102    we can simulate recursion without losing state.  The frames are linked
         to their neighbours through the prev_frame/next_frame pointers.  */
  103 struct scan_frame;
  104 typedef struct scan_frame {
  105     regnode *last_regnode;      /* last node to process in this frame */
  106     regnode *next_regnode;      /* next node to process when last is reached */
  107     U32 prev_recursed_depth;    /* NOTE(review): presumably the recursion
                                         depth to go back to when this frame is
                                         finished -- confirm in study_chunk */
  108     I32 stopparen;              /* what stopparen do we use */
  109
  110     struct scan_frame *this_prev_frame; /* this previous frame
                                                 NOTE(review): how this differs
                                                 from prev_frame is not evident
                                                 here -- confirm in study_chunk */
  111     struct scan_frame *prev_frame;      /* previous frame */
  112     struct scan_frame *next_frame;      /* next frame */
  113 } scan_frame;
 114
  115 /* Certain characters are output as a sequence with the first being a
  116  * backslash.  This tests whether 'c' is one of them: '-', '[', ']', a
       * backslash, or '^'.  It expands to the strchr() result, so its value is
       * a pointer (non-NULL when 'c' is in the set) meant for boolean context.
       * Note that strchr() also matches the string's terminating NUL, so
       * c == 0 yields non-NULL as well.  NOTE(review): confirm that no caller
       * can pass NUL. */
  117 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 118
 119
  120 struct RExC_state_t {
      /* State of a regex compilation.  The RExC_* macros defined in this file
       * are shorthand for these fields; they all go through a pointer that
       * must be named 'pRExC_state'. */
  121     U32         flags;                  /* RXf_* are we folding, multilining? */
  122     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
  123     char        *precomp;               /* uncompiled string. */
  124     char        *precomp_end;           /* pointer to end of uncompiled string. */
  125     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
  126     regexp      *rx;                    /* perl core regexp structure */
  127     regexp_internal     *rxi;           /* internal data for regexp object
  128                                            pprivate field */
  129     char        *start;                 /* Start of input for compile */
  130     char        *end;                   /* End of input for compile */
  131     char        *parse;                 /* Input-scan pointer. */
  132     char        *copy_start;            /* start of copy of input within
  133                                            constructed parse string */
  134     char        *save_copy_start;       /* Provides one level of saving
  135                                            and restoring 'copy_start' */
  136     char        *copy_start_in_input;   /* Position in input string
  137                                            corresponding to copy_start */
  138     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
  139     regnode     *emit_start;            /* Start of emitted-code area */
  140     regnode_offset emit;                /* Code-emit pointer */
  141     I32         naughty;                /* How bad is this pattern? */
  142     I32         sawback;                /* Did we see \1, ...? */
  143     U32         seen;
  144     SSize_t     size;                   /* Number of regnode equivalents in
  145                                            pattern */
  146
  147     /* position beyond 'precomp' of the warning message furthest away from
  148      * 'precomp'.  During the parse, no warnings are raised for any problems
  149      * earlier in the parse than this position.  This works if warnings are
  150      * raised the first time a given spot is parsed, and if only one
  151      * independent warning is raised for any given spot */
  152     Size_t      latest_warn_offset;
  153
  154     I32         npar;                   /* Capture buffer count so far in the
  155                                            parse, (OPEN) plus one. ("par" 0 is
  156                                            the whole pattern)*/
  157     I32         total_par;              /* During initial parse, is either 0,
  158                                            or -1; the latter indicating a
  159                                            reparse is needed.  After that pass,
  160                                            it is what 'npar' became after the
  161                                            pass.  Hence, it being > 0 indicates
  162                                            we are in a reparse situation */
  163     I32         nestroot;               /* root parens we are in - used by
  164                                            accept */
  165     I32         seen_zerolen;
  166     regnode_offset *open_parens;        /* offsets to open parens */
  167     regnode_offset *close_parens;       /* offsets to close parens */
  168     I32      parens_buf_size;           /* #slots malloced open/close_parens */
  169     regnode     *end_op;                /* END node in program */
  170     I32         utf8;           /* whether the pattern is utf8 or not */
  171     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
  172                                 /* XXX use this for future optimisation of case
  173                                  * where pattern must be upgraded to utf8. */
  174     I32         uni_semantics;  /* If a d charset modifier should use unicode
  175                                    rules, even if the pattern is not in
  176                                    utf8 */
  177     HV          *paren_names;           /* Paren names */
  178
  179     regnode     **recurse;              /* Recurse regops */
  180     I32         recurse_count;          /* Number of recurse regops we have generated */
  181     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
  182                                            through */
  183     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
  184     I32         in_lookbehind;
  185     I32         in_lookahead;
  186     I32         contains_locale;
  187     I32         override_recoding;
  188     I32         recode_x_to_native;
  189     I32         in_multi_char_class;
  190     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
  191                                             within pattern */
  192     int         code_index;             /* next code_blocks[] slot */
  193     SSize_t     maxlen;                        /* NOTE(review): this was
                                                        documented as "minimum
                                                        possible number of chars in
                                                        string to match", which
                                                        contradicts the field name;
                                                        presumably it is the
                                                        maximum -- confirm against
                                                        the code that sets it */
  194     scan_frame *frame_head;
  195     scan_frame *frame_last;
  196     U32         frame_count;
  197     AV         *warn_text;
  198     HV         *unlexed_names;
  199 #ifdef ADD_TO_REGEXEC
  200     char        *starttry;              /* -Dr: where regtry was called. */
  201 #define RExC_starttry   (pRExC_state->starttry)
  202 #endif
  203     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
  204 #ifdef DEBUGGING
  205     const char  *lastparse;
  206     I32         lastnum;
  207     AV          *paren_name_list;       /* idx -> name */
  208     U32         study_chunk_recursed_count;
  209     SV          *mysv1;
  210     SV          *mysv2;
  211
      /* Accessors for the DEBUGGING-only fields.  Note that RExC_mysv and
       * RExC_mysv1 both name the same field, 'mysv1'. */
  212 #define RExC_lastparse  (pRExC_state->lastparse)
  213 #define RExC_lastnum    (pRExC_state->lastnum)
  214 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
  215 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
  216 #define RExC_mysv       (pRExC_state->mysv1)
  217 #define RExC_mysv1      (pRExC_state->mysv1)
  218 #define RExC_mysv2      (pRExC_state->mysv2)
  219
  220 #endif
  221     bool        seen_d_op;
  222     bool        strict;
  223     bool        study_started;
  224     bool        in_script_run;
  225     bool        use_BRANCHJ;
  226 };
 227
  228 #define RExC_flags      (pRExC_state->flags)
      /* This macro and the RExC_* ones that follow are shorthand for a field of
       * the RExC_state_t that 'pRExC_state' points to, so a variable of that
       * name must be in scope wherever they are used.  Most are named after
       * the field; the exceptions are noted. */
  229 #define RExC_pm_flags   (pRExC_state->pm_flags)
  230 #define RExC_precomp    (pRExC_state->precomp)
  231 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
  232 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start) /* field is 'copy_start' */
  233 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start) /* field is 'save_copy_start' */
  234 #define RExC_precomp_end (pRExC_state->precomp_end)
  235 #define RExC_rx_sv      (pRExC_state->rx_sv)
  236 #define RExC_rx         (pRExC_state->rx)
  237 #define RExC_rxi        (pRExC_state->rxi)
  238 #define RExC_start      (pRExC_state->start)
  239 #define RExC_end        (pRExC_state->end)
  240 #define RExC_parse      (pRExC_state->parse)
  241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
  242 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
  243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
  244                                                    under /d from /u ? */
 245
  246 #ifdef RE_TRACK_PATTERN_OFFSETS
  247 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
  248                                                          others: I reach
                                                               through RExC_rxi
                                                               (its u.offsets
                                                               member) instead of
                                                               naming a field of
                                                               *pRExC_state */
  249 #endif
  250 #define RExC_emit       (pRExC_state->emit)
  251 #define RExC_emit_start (pRExC_state->emit_start)
  252 #define RExC_sawback    (pRExC_state->sawback)
  253 #define RExC_seen       (pRExC_state->seen)
  254 #define RExC_size       (pRExC_state->size)
  255 #define RExC_maxlen        (pRExC_state->maxlen)
  256 #define RExC_npar       (pRExC_state->npar)
  257 #define RExC_total_parens       (pRExC_state->total_par) /* field is 'total_par' */
  258 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
  259 #define RExC_nestroot   (pRExC_state->nestroot)
  260 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
  261 #define RExC_utf8       (pRExC_state->utf8)
  262 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
  263 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
  264 #define RExC_open_parens        (pRExC_state->open_parens)
  265 #define RExC_close_parens       (pRExC_state->close_parens)
  266 #define RExC_end_op     (pRExC_state->end_op)
  267 #define RExC_paren_names        (pRExC_state->paren_names)
  268 #define RExC_recurse    (pRExC_state->recurse)
  269 #define RExC_recurse_count      (pRExC_state->recurse_count)
  270 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
  271 #define RExC_study_chunk_recursed_bytes  \
  272                                    (pRExC_state->study_chunk_recursed_bytes)
  273 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
  274 #define RExC_in_lookahead       (pRExC_state->in_lookahead)
  275 #define RExC_contains_locale    (pRExC_state->contains_locale)
  276 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
  277
      /* SET_recode_x_to_native(x): on EBCDIC builds, store 'x' in
       * RExC_recode_x_to_native.  Elsewhere it expands to NOOP and 'x' does not
       * appear in the expansion, so it is not evaluated. */
  278 #ifdef EBCDIC
  279 #  define SET_recode_x_to_native(x)                                         \
  280                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
  281 #else
  282 #  define SET_recode_x_to_native(x) NOOP
  283 #endif
  284
  285 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
  286 #define RExC_frame_head (pRExC_state->frame_head)
  287 #define RExC_frame_last (pRExC_state->frame_last)
  288 #define RExC_frame_count (pRExC_state->frame_count)
  289 #define RExC_strict (pRExC_state->strict)
  290 #define RExC_study_started      (pRExC_state->study_started)
  291 #define RExC_warn_text (pRExC_state->warn_text)
  292 #define RExC_in_script_run      (pRExC_state->in_script_run)
  293 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
  294 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 295
 296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 297  * a flag to disable back-off on the fixed/floating substrings - if it's
 298  * a high complexity pattern we assume the benefit of avoiding a full match
 299  * is worth the cost of checking for the substrings even if they rarely help.
 300  */
 301 #define RExC_naughty    (pRExC_state->naughty)
 302 #define TOO_NAUGHTY (10)
 303 #define MARK_NAUGHTY(add) \
 304     if (RExC_naughty < TOO_NAUGHTY) \
 305         RExC_naughty += (add)
 306 #define MARK_NAUGHTY_EXP(exp, add) \
 307     if (RExC_naughty < TOO_NAUGHTY) \
 308         RExC_naughty += RExC_naughty / (exp) + (add)
 309
 310 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 311 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 312         ((*s) == '{' && regcurly(s)))
 313
  314 /*
  315  * Flags to be passed up and down.  Apart from WORST, which is zero, each
       * one is a distinct bit, so they can be combined with '|'.
  316  */
  317 #define WORST           0       /* Worst case. */
  318 #define HASWIDTH        0x01    /* Known to not match null strings, could match
  319                                    non-null ones. */
  320
  321 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
  322  * character.  (There needs to be a case: in the switch statement in regexec.c
  323  * for any node marked SIMPLE.)  Note that this is not the same thing as
  324  * REGNODE_SIMPLE */
  325 #define SIMPLE          0x02
  326 #define SPSTART         0x04    /* Starts with * or + */
  327 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
  328 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
  329 #define RESTART_PARSE   0x20    /* Need to redo the parse */
  330 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
  331                                    calculate sizes as UTF-8 */
 332
  333 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
      /* REG_NODE_NUM(x): the distance of node pointer 'x' from RExC_emit_start,
       * cast to int, or -1 when 'x' is NULL.  'x' is expanded twice. */
 334
  335 /* whether trie related optimizations are enabled: the three switches below
       * are defined together, and only when
       * PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION evaluates to non-zero */
  336 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
  337 #define TRIE_STUDY_OPT
  338 #define FULL_TRIE_STUDY
  339 #define TRIE_STCLASS
  340 #endif
 341
 342
 343
  344 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
  345 #define PBITVAL(paren) (1 << ((paren) & 7))
  346 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
  347 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
  348 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
      /* The five macros above treat 'u8str' as a bitmap with one bit per
       * 'paren' value.  PBYTE selects byte (paren >> 3) of it and PBITVAL is
       * the mask for bit (paren & 7) within that byte.  PAREN_TEST is non-zero
       * when the bit is set; PAREN_SET and PAREN_UNSET set and clear it.  The
       * last two expand to unparenthesized assignment expressions, so they
       * are only suitable for use as statements. */
 349
  350 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
  351                                      if (!UTF) {                           \
  352                                          *flagp = RESTART_PARSE|NEED_UTF8; \
  353                                          return 0;                         \
  354                                      }                                     \
  355                              } STMT_END
      /* REQUIRE_UTF8(flagp): when UTF is false, overwrite '*flagp' with
       * RESTART_PARSE|NEED_UTF8 (discarding whatever it held) and return 0
       * from the function the macro is used in.  When UTF is true it does
       * nothing. */
 356
  357 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
  358  * a flag that indicates we need to override /d with /u as a result of
  359  * something in the pattern.  It should only be used in regards to calling
  360  * set_regex_charset() or get_regex_charset()
       *
       * The macro does nothing unless DEPENDS_SEMANTICS is true.  When it is, the
       * charset in RExC_flags is switched to REGEX_UNICODE_CHARSET and
       * RExC_uni_semantics is set.  The function the macro is used in then
       * returns 'restart_retval', with RESTART_PARSE or'ed into '*flagp', but only
       * if RExC_seen_d_op is set and IN_PARENS_PASS is false; otherwise
       * execution simply continues after the macro. */
  361 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
  362     STMT_START {                                                            \
  363             if (DEPENDS_SEMANTICS) {                                        \
  364                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
  365                 RExC_uni_semantics = 1;                                     \
  366                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
  367                     /* No need to restart the parse if we haven't seen      \
  368                      * anything that differs between /u and /d, and no need \
  369                      * to restart immediately if we're going to reparse     \
  370                      * anyway to count parens */                            \
  371                     *flagp |= RESTART_PARSE;                                \
  372                     return restart_retval;                                  \
  373                 }                                                           \
  374             }                                                               \
  375     } STMT_END
 376
  377 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
  378     STMT_START {                                                            \
  379                 RExC_use_BRANCHJ = 1;                                       \
  380                 *flagp |= RESTART_PARSE;                                    \
  381                 return restart_retval;                                      \
  382     } STMT_END
      /* REQUIRE_BRANCHJ(flagp, restart_retval): unconditionally set
       * RExC_use_BRANCHJ, OR RESTART_PARSE into '*flagp', and return
       * 'restart_retval' from the function the macro is used in. */
 383
  384 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
  385  * less.  After that, it must always be positive, because the whole re is
  386  * considered to be surrounded by virtual parens.  Setting it to negative
  387  * indicates there is some construct that needs to know the actual number of
  388  * parens to be properly handled.  And that means an extra pass will be
  389  * required after we've counted them all
       *
       * ALL_PARENS_COUNTED  is true once RExC_total_parens is positive.
       * REQUIRE_PARENS_PASS requests the extra pass by setting RExC_total_parens
       *                     to -1, unless the count is already complete.
       * IN_PARENS_PASS      is true while RExC_total_parens is negative, that
       *                     is, after the extra pass has been requested. */
  390 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
  391 #define REQUIRE_PARENS_PASS                                                 \
  392     STMT_START {  /* No-op if have completed a pass */                      \
  393                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
  394     } STMT_END
  395 #define IN_PARENS_PASS (RExC_total_parens < 0)
 396
 397
 398 /* This is used to return failure (zero) early from the calling function if
 399  * various flags in 'flags' are set.  Two flags always cause a return:
 400  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 401  * additional flags that should cause a return; 0 if none.  If the return will
 402  * be done, '*flagp' is first set to be all of the flags that caused the
 403  * return. */
 404 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 405     STMT_START {                                                            \
 406             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 407                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 408                 return 0;                                                   \
 409             }                                                               \
 410     } STMT_END
 411
 412 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 413
 414 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 415                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 416 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 417                                     if (MUST_RESTART(*(flagp))) return 0
 418
  419 /* This converts the named class defined in regcomp.h to its equivalent class
  420  * number defined in handy.h.  The division is an integer one, so for
       * non-negative values named classes 2n and 2n+1 both give class number n.
       * classnum_to_namedclass() goes the other way and always yields the even
       * one, 2n. */
  421 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
  422 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 423
  424 #define _invlist_union_complement_2nd(a, b, output) \
  425                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
  426 #define _invlist_intersection_complement_2nd(a, b, output) \
  427                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
      /* Both macros above call the corresponding *_maybe_complement_2nd()
       * function with TRUE as its third argument; 'a', 'b' and 'output' are
       * passed through unchanged. */
 428
 429 /* About scan_data_t.
 430
 431   During optimisation we recurse through the regexp program performing
 432   various inplace (keyhole style) optimisations. In addition study_chunk
 433   and scan_commit populate this data structure with information about
 434   what strings MUST appear in the pattern. We look for the longest
 435   string that must appear at a fixed location, and we look for the
 436   longest string that may appear at a floating location. So for instance
 437   in the pattern:
 438
 439     /FOO[xX]A.*B[xX]BAR/
 440
 441   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 442   strings (because they follow a .* construct). study_chunk will identify
 443   both FOO and BAR as being the longest fixed and floating strings respectively.
 444
 445   The strings can be composites, for instance
 446
 447      /(f)(o)(o)/
 448
 449   will result in a composite fixed substring 'foo'.
 450
 451   For each string some basic information is maintained:
 452
 453   - min_offset
 454     This is the position the string must appear at, or not before.
 455     It also implicitly (when combined with minlenp) tells us how many
 456     characters must match before the string we are searching for.
 457     Likewise when combined with minlenp and the length of the string it
 458     tells us how many characters must appear after the string we have
 459     found.
 460
 461   - max_offset
 462     Only used for floating strings. This is the rightmost point that
 463     the string can appear at. If set to SSize_t_MAX it indicates that the
 464     string can occur infinitely far to the right.
 465     For fixed strings, it is equal to min_offset.
 466
 467   - minlenp
 468     A pointer to the minimum number of characters of the pattern that the
 469     string was found inside. This is important as in the case of positive
 470     lookahead or positive lookbehind we can have multiple patterns
 471     involved. Consider
 472
 473     /(?=FOO).*F/
 474
 475     The minimum length of the pattern overall is 3, the minimum length
 476     of the lookahead part is 3, but the minimum length of the part that
 477     will actually match is 1. So 'FOO's minimum length is 3, but the
 478     minimum length for the F is 1. This is important as the minimum length
 479     is used to determine offsets in front of and behind the string being
 480     looked for.  Since strings can be composites this is the length of the
 481     pattern at the time it was committed with a scan_commit. Note that
 482     the length is calculated by study_chunk, so that the minimum lengths
 483     are not known until the full pattern has been compiled, thus the
 484     pointer to the value.
 485
 486   - lookbehind
 487
 488     In the case of lookbehind the string being searched for can be
 489     offset past the start point of the final matching string.
 490     If this value was just blithely removed from the min_offset it would
 491     invalidate some of the calculations for how many chars must match
 492     before or after (as they are derived from min_offset and minlen and
 493     the length of the string being searched for).
 494     When the final pattern is compiled and the data is moved from the
 495     scan_data_t structure into the regexp structure the information
 496     about lookbehind is factored in, with the information that would
 497     have been lost precalculated in the end_shift field for the
 498     associated string.
 499
 500   The fields pos_min and pos_delta are used to store the minimum offset
 501   and the delta to the maximum offset at the current point in the pattern.
 502
 503 */
 504
  505 struct scan_data_substrs {
      /* What is recorded about one candidate substring.  scan_data_t holds
       * two of these: one for the longest fixed substring and one for the
       * longest floating substring. */
  506     SV      *str;       /* longest substring found in pattern */
  507     SSize_t min_offset; /* earliest point in string it can appear */
  508     SSize_t max_offset; /* latest point in string it can appear; SSize_t_MAX
                                 means it can occur infinitely far to the right,
                                 and for a fixed string it equals min_offset */
  509     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
  510     SSize_t lookbehind; /* is the pos of the string modified by LB
                                 (lookbehind) */
  511     I32 flags;          /* per substring SF_* and SCF_* flags */
  512 };
 513
  514 typedef struct scan_data_t {
  515     /*I32 len_min;      unused */
  516     /*I32 len_delta;    unused */
  517     SSize_t pos_min;        /* minimum offset at the current point in the
                                     pattern */
  518     SSize_t pos_delta;      /* delta from pos_min to the maximum offset at
                                     the current point in the pattern */
  519     SV *last_found;
  520     SSize_t last_end;       /* min value, <0 unless valid. */
  521     SSize_t last_start_min;
  522     SSize_t last_start_max;
  523     U8      cur_is_floating; /* whether the last_* values should be set as
  524                               * the next fixed (0) or floating (1)
  525                               * substring */
  526
  527     /* [0] is longest fixed substring so far, [1] is longest float so far */
  528     struct scan_data_substrs  substrs[2];
  529
  530     I32 flags;             /* common SF_* and SCF_* flags */
  531     I32 whilem_c;
  532     SSize_t *last_closep;
  533     regnode_ssc *start_class;
  534 } scan_data_t;
 535
 536 /*
 537  * Forward declarations for pregcomp()'s friends.
 538  */
 539
  540 static const scan_data_t zero_scan_data = {
      /* A scan_data_t with every member zero or NULL.  There is one
       * initializer per member, in declaration order. */
  541     0, 0, NULL, 0, 0, 0, 0,         /* pos_min through cur_is_floating */
  542     {
  543         { NULL, 0, 0, 0, 0, 0 },    /* substrs[0]: longest fixed */
  544         { NULL, 0, 0, 0, 0, 0 },    /* substrs[1]: longest floating */
  545     },
  546     0, 0, NULL, NULL                /* flags, whilem_c, last_closep,
                                             start_class */
  547 };
 548
  549 /* study flags.  The SF_* and SCF_* values are stored together in the same
       * 'flags' members, and every value defined below is a distinct bit. */
  550
  551 #define SF_BEFORE_SEOL          0x0001
  552 #define SF_BEFORE_MEOL          0x0002
  553 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
  554
  555 #define SF_IS_INF               0x0040
  556 #define SF_HAS_PAR              0x0080
  557 #define SF_IN_PAR               0x0100
  558 #define SF_HAS_EVAL             0x0200
  559
  560
  561 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
  562  * longest substring in the pattern. When it is not set the optimiser keeps
  563  * track of position, but does not keep track of the actual strings seen,
  564  *
  565  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
  566  * /foo/i will not.
  567  *
  568  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
  569  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
  570  * turned off because of the alternation (BRANCH). */
  571 #define SCF_DO_SUBSTR           0x0400
  572
  573 #define SCF_DO_STCLASS_AND      0x0800
  574 #define SCF_DO_STCLASS_OR       0x1000
  575 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
  576 #define SCF_WHILEM_VISITED_POS  0x2000
  577
  578 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
  579 #define SCF_SEEN_ACCEPT         0x8000
  580 #define SCF_TRIE_DOING_RESTUDY 0x10000
  581 #define SCF_IN_DEFINE          0x20000
 582
 583
 584
 585
  586 #define UTF cBOOL(RExC_utf8)    /* is the pattern utf8? */
  587
  588 /* The enums for all these are ordered so things work out correctly: the
       * AT_LEAST_* tests below depend on that ordering, because they compare
       * the charset with '>=' rather than '=='.  All of them read the charset
       * out of RExC_flags. */
  589 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
  590 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
  591                                                      == REGEX_DEPENDS_CHARSET)
  592 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
  593 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
  594                                                      >= REGEX_UNICODE_CHARSET)
  595 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
  596                                             == REGEX_ASCII_RESTRICTED_CHARSET)
  597 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
  598                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
  599 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
  600                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
  601
  602 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)   /* is RXf_PMf_FOLD set? */
 603
  604 /* For programs that want to be strictly Unicode compatible by dying if any
  605  * attempt is made to match a non-Unicode code point against a Unicode
  606  * property.
       *
       * NOTE(review): presumably this is true when warnings in the
       * WARN_NON_UNICODE category have been made fatal, in which case the
       * warning has to be raised unconditionally -- confirm against the
       * definition of ckDEAD().  */
  607 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 608
  609 #define OOB_NAMEDCLASS          -1  /* NOTE(review): presumably the
                                             out-of-bounds marker for a named
                                             class.  The expansion is a bare,
                                             unparenthesized -1 */
  610
  611 /* There is no code point that is out-of-bounds, so this is problematic.  But
  612  * its only current use is to initialize a variable that is always set before
  613  * looked at. */
  614 #define OOB_UNICODE             0xDEADBEEF
 615
  616 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
      /* CHR_SVLEN(sv): sv_len_utf8(sv) when UTF is true, SvCUR(sv) otherwise. */
 617
 618
  619 /* length of regex to show in messages that don't mark a position within */
  620 #define RegexLengthToShowInErrorMessages 127
  621
  622 /*
  623  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
  624  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
  625  * op/pragma/warn/regcomp.
  626  */
  627 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
  628 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
  629
      /* Tail of a message format.  It consumes two UTF8f arguments: the text
       * to show before MARKER2 and the text to show after it. */
  630 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
  631                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 632
 633 /* The code in this file in places uses one level of recursion with parsing
 634  * rebased to an alternate string constructed by us in memory.  This can take
 635  * the form of something that is completely different from the input, or
 636  * something that uses the input as part of the alternate.  In the first case,
 637  * there should be no possibility of an error, as we are in complete control of
 638  * the alternate string.  But in the second case we don't completely control
 639  * the input portion, so there may be errors in that.  Here's an example:
 640  *      /[abc\x{DF}def]/ui
 641  * is handled specially because \x{df} folds to a sequence of more than one
 642  * character: 'ss'.  What is done is to create and parse an alternate string,
 643  * which looks like this:
 644  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 645  * where it uses the input unchanged in the middle of something it constructs,
 646  * which is a branch for the DF outside the character class, and clustering
 647  * parens around the whole thing. (It knows enough to skip the DF inside the
 648  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 649  * need to be reported.  The general situation looks like this:
 650  *
 651  *                                       |<------- identical ------>|
 652  *              sI                       tI               xI       eI
 653  * Input:       ---------------------------------------------------------------
 654  * Constructed:         ---------------------------------------------------
 655  *                      sC               tC               xC       eC     EC
 656  *                                       |<------- identical ------>|
 657  *
 658  * sI..eI   is the portion of the input pattern we are concerned with here.
 659  * sC..EC   is the constructed substitute parse string.
 660  *  sC..tC  is constructed by us
 661  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 662  *          In the diagram, these are vertically aligned.
 663  *  eC..EC  is also constructed by us.
 664  * xC       is the position in the substitute parse string where we found a
 665  *          problem.
 666  * xI       is the position in the original pattern corresponding to xC.
 667  *
 668  * We want to display a message showing the real input string.  Thus we need to
 669  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 670  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 671  * get:
 672  *      xI = tI + (xC - tC)
 673  *
 674  * When the substitute parse is constructed, the code needs to set:
 675  *      RExC_start (sC)
 676  *      RExC_end (eC)
 677  *      RExC_copy_start_in_input  (tI)
 678  *      RExC_copy_start_in_constructed (tC)
 679  * and restore them when done.
 680  *
 681  * During normal processing of the input pattern, both
 682  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 683  * sI, so that xC equals xI.
 684  */
 685
 686 #define sI              RExC_precomp
 687 #define eI              RExC_precomp_end
 688 #define sC              RExC_start
 689 #define eC              RExC_end
 690 #define tI              RExC_copy_start_in_input
 691 #define tC              RExC_copy_start_in_constructed
     /* Translate xC, a position in the string currently being parsed, into the
      * corresponding position in the input pattern:  xI = tI + (xC - tC).
      * The argument is parenthesized so that passing an expression (rather
      * than a plain variable) cannot be re-associated by the subtraction. */
 692 #define xI(xC)          (tI + ((xC) - tC))
     /* The same position expressed as an offset from the start of the input
      * pattern (sI). */
 693 #define xI_offset(xC)   (xI(xC) - sI)
 694
     /* Expands to the two UTF8fARG() arguments that REPORT_LOCATION consumes:
      * first the input pattern from sI up to the position corresponding to xC,
      * then the remainder up to eI.  A position beyond eI is clamped to eI (so
      * the second part is empty); a position before sI makes the expansion
      * croak with a "panic" message instead. */
 695 #define REPORT_LOCATION_ARGS(xC)                                            \
 696     UTF8fARG(UTF,                                                           \
 697              (xI(xC) > eI) /* Don't run off end */                          \
 698               ? eI - sI   /* Length before the <--HERE */                   \
 699               : ((xI_offset(xC) >= 0)                                       \
 700                  ? xI_offset(xC)                                            \
 701                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 702                                     IVdf " trying to output message for "   \
 703                                     " pattern %.*s",                        \
 704                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 705                                     ((int) (eC - sC)), sC), 0)),            \
 706              sI),         /* The input pattern printed up to the <--HERE */ \
 707     UTF8fARG(UTF,                                                           \
 708              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 709              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 710
 711 /* Used to point after bad bytes for an error message, but avoid skipping
 712  * past a nul byte.  Evaluates to the number of bytes to step over at 's':
      * 0 when *s is NUL; otherwise UTF8_SAFE_SKIP(s, e) when UTF is true, and 1
      * when it is not. */
 713 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 714
 715 /* Set up to clean up after our imminent demise: each of RExC_rx_sv,
      * RExC_open_parens and RExC_close_parens that is non-NULL is handed to
      * SAVEFREESV() / SAVEFREEPV() respectively.
      * NOTE(review): presumably this lets the unwinding done by the following
      * croak free them -- confirm against the savestack documentation. */
 716 #define PREPARE_TO_DIE                                                      \
 717     STMT_START {                                                            \
 718         if (RExC_rx_sv)                                                     \
 719             SAVEFREESV(RExC_rx_sv);                                         \
 720         if (RExC_open_parens)                                               \
 721             SAVEFREEPV(RExC_open_parens);                                   \
 722         if (RExC_close_parens)                                              \
 723             SAVEFREEPV(RExC_close_parens);                                  \
 724     } STMT_END
 725
 726 /*
 727  * Runs PREPARE_TO_DIE, then executes 'code', which in the FAIL* macros
 728  * below is a Perl_croak call. Show regex, up to a maximum length. If it's
 729  * too long, chop and add "...".
      *
      * 'code' is expanded inside the block, so it can use the two locals set up
      * here: 'len', the number of bytes of RExC_precomp to show, and
      * 'ellipses', which is "..." when the regex was chopped and "" otherwise.
 730  */
 731 #define _FAIL(code) STMT_START {                                        \
 732     const char *ellipses = "";                                          \
 733     IV len = RExC_precomp_end - RExC_precomp;                           \
 734                                                                         \
 735     PREPARE_TO_DIE;                                                     \
 736     if (len > RegexLengthToShowInErrorMessages) {                       \
 737         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 738         len = RegexLengthToShowInErrorMessages - 10;                    \
 739         ellipses = "...";                                               \
 740     }                                                                   \
 741     code;                                                               \
 742 } STMT_END
 743
     /* 'msg' is passed as the argument of a "%s", so it need not be a literal */
 744 #define FAIL(msg) _FAIL(                            \
 745     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 746             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 747
     /* In FAIL2 and FAIL3 'msg' is pasted onto the format string, so it must be
      * a string literal; its conversions consume 'arg' (FAIL2) or 'arg1' and
      * 'arg2' (FAIL3). */
 748 #define FAIL2(msg,arg) _FAIL(                       \
 749     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 750             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 751
 752 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 753     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 754      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 755
 756 /*
 757  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
      * (RExC_parse) using REPORT_LOCATION.  It croaks directly, without running
      * PREPARE_TO_DIE first.
 758  */
 759 #define Simple_vFAIL(m) STMT_START {                                    \
 760     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 761             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 762 } STMT_END
 763
 764 /*
 765  * Runs PREPARE_TO_DIE, then Simple_vFAIL()
 766  */
 767 #define vFAIL(m) STMT_START {                           \
 768     PREPARE_TO_DIE;                                     \
 769     Simple_vFAIL(m);                                    \
 770 } STMT_END
 771
 772 /*
 773  * Like Simple_vFAIL(), but accepts two arguments.  The message format 'm'
      * and REPORT_LOCATION are handed to S_re_croak2() as separate arguments.
 774  */
 775 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 776     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 777                       REPORT_LOCATION_ARGS(RExC_parse));        \
 778 } STMT_END
 779
 780 /*
 781  * Runs PREPARE_TO_DIE, then Simple_vFAIL2().
 782  */
 783 #define vFAIL2(m,a1) STMT_START {                       \
 784     PREPARE_TO_DIE;                                     \
 785     Simple_vFAIL2(m, a1);                               \
 786 } STMT_END
 787
 788
 789 /*
 790  * Like Simple_vFAIL(), but accepts three arguments.
 791  */
 792 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 793     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 794             REPORT_LOCATION_ARGS(RExC_parse));                  \
 795 } STMT_END
 796
 797 /*
 798  * Runs PREPARE_TO_DIE, then Simple_vFAIL3().
 799  */
 800 #define vFAIL3(m,a1,a2) STMT_START {                    \
 801     PREPARE_TO_DIE;                                     \
 802     Simple_vFAIL3(m, a1, a2);                           \
 803 } STMT_END
 804
 805 /*
 806  * Like Simple_vFAIL(), but accepts four arguments.
 807  */
 808 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 809     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 810             REPORT_LOCATION_ARGS(RExC_parse));                  \
 811 } STMT_END
 812
     /* Runs PREPARE_TO_DIE, then Simple_vFAIL4(). */
 813 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 814     PREPARE_TO_DIE;                                     \
 815     Simple_vFAIL4(m, a1, a2, a3);                       \
 816 } STMT_END
 817
 818 /* A specialized version of vFAIL2 that works with UTF8f.  As written here
      * its expansion performs the same two steps as vFAIL2: PREPARE_TO_DIE,
      * then S_re_croak2() with 'm', REPORT_LOCATION and 'a1'. */
 819 #define vFAIL2utf8f(m, a1) STMT_START {             \
 820     PREPARE_TO_DIE;                                 \
 821     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 822             REPORT_LOCATION_ARGS(RExC_parse));      \
 823 } STMT_END
 824
     /* The three-argument counterpart of vFAIL2utf8f */
 825 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 826     PREPARE_TO_DIE;                                     \
 827     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 828             REPORT_LOCATION_ARGS(RExC_parse));          \
 829 } STMT_END
 830
 831 /* Setting RExC_copy_start_in_constructed to NULL is a signal to not output
      * warnings.  The previous value is stashed in
      * RExC_save_copy_start_in_constructed so that RESTORE_WARNINGS can put it
      * back; only that single saved value is kept. */
 832 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 833     STMT_START {                                                            \
 834       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 835       RExC_copy_start_in_constructed = NULL;                                \
 836     } STMT_END
     /* A bare assignment expression: the user supplies the terminating ';' */
 837 #define RESTORE_WARNINGS                                                    \
 838     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 839
 840 /* Since a warning can be generated multiple times as the input is reparsed, we
 841  * output it the first time we come to that point in the parse, but suppress it
 842  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 843  * generate any warnings.  Otherwise the test is whether the input offset
      * corresponding to 'loc' is strictly greater than RExC_latest_warn_offset */
 844 #define TO_OUTPUT_WARNINGS(loc)                                         \
 845   (   RExC_copy_start_in_constructed                                    \
 846    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 847
 848 /* After we've emitted a warning, we save the position in the input so we don't
 849  * output it again.  The saved position is clamped to lie within sI..eI, and
      * is only updated when TO_OUTPUT_WARNINGS(loc) holds. */
 850 #define UPDATE_WARNINGS_LOC(loc)                                        \
 851     STMT_START {                                                        \
 852         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 853             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
 854                                                        - RExC_precomp;  \
 855         }                                                               \
 856     } STMT_END
 857
 858 /* 'warns' is the output of the packWARNx macro used in 'code'.
      *
      * Common wrapper for the warning macros:
      *  - croaks with a "panic!" message if called while
      *    RExC_copy_start_in_constructed is NULL (warnings turned off);
      *  - otherwise, if TO_OUTPUT_WARNINGS(loc) holds, runs PREPARE_TO_DIE first
      *    when ckDEAD(warns) is true, then executes 'code', then records 'loc'
      *    with UPDATE_WARNINGS_LOC. */
 859 #define _WARN_HELPER(loc, warns, code)                                  \
 860     STMT_START {                                                        \
 861         if (! RExC_copy_start_in_constructed) {                         \
 862             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 863                               " expected at '%s'",                      \
 864                               __FILE__, __LINE__, loc);                 \
 865         }                                                               \
 866         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 867             if (ckDEAD(warns))                                          \
 868                 PREPARE_TO_DIE;                                         \
 869             code;                                                       \
 870             UPDATE_WARNINGS_LOC(loc);                                   \
 871         }                                                               \
 872     } STMT_END
 873
 874 /* m is not necessarily a "literal string", in this macro: it is passed as
      * the argument of a "%s" rather than pasted onto the format */
 875 #define reg_warn_non_literal_string(loc, m)                             \
 876     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 877                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 878                                        "%s" REPORT_LOCATION,            \
 879                                   m, REPORT_LOCATION_ARGS(loc)))
 880
     /* Here 'm' is pasted onto REPORT_LOCATION, so it must be a string literal;
      * the warning goes through Perl_ck_warner() in category WARN_REGEXP */
 881 #define ckWARNreg(loc,m)                                                \
 882     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 883                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 884                                           m REPORT_LOCATION,            \
 885                                           REPORT_LOCATION_ARGS(loc)))
 886
     /* Emit, through Perl_warner() in category WARN_REGEXP, the warning 'm'
      * with the position 'loc' marked in the pattern.  'm' is pasted onto
      * REPORT_LOCATION, so it must be a string literal.  The last line carries
      * no line-continuation, so the definition ends here no matter what
      * follows it in the file. */
 887 #define vWARN(loc, m)                                                   \
 888     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 889                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 890                                        m REPORT_LOCATION,               \
 891                                        REPORT_LOCATION_ARGS(loc)))
 892
     /* The macros below all wrap a warner call in _WARN_HELPER() and append
      * REPORT_LOCATION to the string literal 'm'.  As can be read off the
      * definitions:
      *   - names starting with 'v' call Perl_warner();
      *   - names starting with 'ck' call Perl_ck_warner(), or Perl_ck_warner_d()
      *     for the 'dep', '_d' and 'experimental' variants;
      *   - a digit in the name is one more than the number of extra arguments
      *     (a1, a2, ...) consumed by the conversions in 'm';
      *   - the warning category is WARN_REGEXP, WARN_DEPRECATED, both, or (for
      *     ckWARNexperimental) the caller-supplied 'class'. */
 893 #define vWARN_dep(loc, m)                                               \
 894     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 895                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 896                                        m REPORT_LOCATION,               \
 897                                        REPORT_LOCATION_ARGS(loc)))
 898
 899 #define ckWARNdep(loc,m)                                                \
 900     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 901                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 902                                             m REPORT_LOCATION,          \
 903                                             REPORT_LOCATION_ARGS(loc)))
 904
 905 #define ckWARNregdep(loc,m)                                                 \
 906     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 907                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 908                                                       WARN_REGEXP),         \
 909                                              m REPORT_LOCATION,             \
 910                                              REPORT_LOCATION_ARGS(loc)))
 911
 912 #define ckWARN2reg_d(loc,m, a1)                                             \
 913     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 914                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
 915                                             m REPORT_LOCATION,              \
 916                                             a1, REPORT_LOCATION_ARGS(loc)))
 917
 918 #define ckWARN2reg(loc, m, a1)                                              \
 919     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 920                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 921                                           m REPORT_LOCATION,                \
 922                                           a1, REPORT_LOCATION_ARGS(loc)))
 923
 924 #define vWARN3(loc, m, a1, a2)                                              \
 925     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 926                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
 927                                        m REPORT_LOCATION,                   \
 928                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
 929
 930 #define ckWARN3reg(loc, m, a1, a2)                                          \
 931     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 932                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 933                                           m REPORT_LOCATION,                \
 934                                           a1, a2,                           \
 935                                           REPORT_LOCATION_ARGS(loc)))
 936
 937 #define vWARN4(loc, m, a1, a2, a3)                                      \
 938     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 939                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 940                                        m REPORT_LOCATION,               \
 941                                        a1, a2, a3,                      \
 942                                        REPORT_LOCATION_ARGS(loc)))
 943
 944 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
 945     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 946                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 947                                           m REPORT_LOCATION,            \
 948                                           a1, a2, a3,                   \
 949                                           REPORT_LOCATION_ARGS(loc)))
 950
 951 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
 952     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 953                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 954                                        m REPORT_LOCATION,               \
 955                                        a1, a2, a3, a4,                  \
 956                                        REPORT_LOCATION_ARGS(loc)))
 957
 958 #define ckWARNexperimental(loc, class, m)                               \
 959     _WARN_HELPER(loc, packWARN(class),                                  \
 960                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
 961                                             m REPORT_LOCATION,          \
 962                                             REPORT_LOCATION_ARGS(loc)))
 963
 964 /* Convert between a pointer to a node and its offset from the beginning of the
 965  * program.  Both are plain pointer arithmetic relative to RExC_emit_start, so
      * REGNODE_p(REGNODE_OFFSET(node)) yields 'node' again. */
 966 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
 967 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
 968
 969 /* Macros for recording node offsets.   20001227 mjd@plover.com
 970  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 971  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 972  * Element 0 holds the number n.
 973  * Position is 1 indexed.
 974  */
 975 #ifndef RE_TRACK_PATTERN_OFFSETS
 976 #define Set_Node_Offset_To_R(offset,byte)
 977 #define Set_Node_Offset(node,byte)
 978 #define Set_Cur_Node_Offset
 979 #define Set_Node_Length_To_R(node,len)
 980 #define Set_Node_Length(node,len)
 981 #define Set_Node_Cur_Length(node,start)
 982 #define Node_Offset(n)
 983 #define Node_Length(n)
 984 #define Set_Node_Offset_Length(node,offset,len)
 985 #define ProgLen(ri) ri->u.proglen
 986 #define SetProgLen(ri,x) ri->u.proglen = x
 987 #define Track_Code(code)
 988 #else
 989 #define ProgLen(ri) ri->u.offsets[0]
 990 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 991 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
 992         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 993                     __LINE__, (int)(offset), (int)(byte)));             \
 994         if((offset) < 0) {                                              \
 995             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 996                                          (int)(offset));                \
 997         } else {                                                        \
 998             RExC_offsets[2*(offset)-1] = (byte);                        \
 999         }                                                               \
1000 } STMT_END
1001
1002 #define Set_Node_Offset(node,byte)                                      \
1003     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1004 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1005
1006 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1007         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1008                 __LINE__, (int)(node), (int)(len)));                    \
1009         if((node) < 0) {                                                \
1010             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1011                                          (int)(node));                  \
1012         } else {                                                        \
1013             RExC_offsets[2*(node)] = (len);                             \
1014         }                                                               \
1015 } STMT_END
1016
1017 #define Set_Node_Length(node,len) \
1018     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1019 #define Set_Node_Cur_Length(node, start)                \
1020     Set_Node_Length(node, RExC_parse - start)
1021
1022 /* Get offsets and lengths */
1023 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1024 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1025
1026 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1027     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1028     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1029 } STMT_END
1030
1031 #define Track_Code(code) STMT_START { code } STMT_END
1032 #endif
1033
1034 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
     /* Defined (with an empty value) only when the symbol tested above
      * evaluates to non-zero in the preprocessor */
1035 #define EXPERIMENTAL_INPLACESCAN
1036 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1037
1038 #ifdef DEBUGGING
     /* printf-style output of 'fmt' and its variadic arguments to
      * Perl_debug_log.  Returns whatever PerlIO_vprintf() returned. */
1039 int
1040 Perl_re_printf(pTHX_ const char *fmt, ...)
1041 {
1042     va_list ap;
1043     int result;
1044     PerlIO *f= Perl_debug_log;
1045     PERL_ARGS_ASSERT_RE_PRINTF;
1046     va_start(ap, fmt);
1047     result = PerlIO_vprintf(f, fmt, ap);
1048     va_end(ap);
1049     return result;
1050 }
1051
     /* Like Perl_re_printf(), but first writes an indent to Perl_debug_log.
      * Note the argument order: 'depth' comes after 'fmt' and before the
      * variadic arguments that 'fmt' consumes.  The indent is two spaces per
      * level, with the level taken modulo 20.  The return value is that of the
      * PerlIO_vprintf() call only; the indent is not counted. */
1052 int
1053 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1054 {
1055     va_list ap;
1056     int result;
1057     PerlIO *f= Perl_debug_log;
1058     PERL_ARGS_ASSERT_RE_INDENTF;
1059     va_start(ap, depth);
     /* "%*s" with an empty string emits exactly the computed width in spaces */
1060     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1061     result = PerlIO_vprintf(f, fmt, ap);
1062     va_end(ap);
1063     return result;
1064 }
1065 #endif /* DEBUGGING */
1066
     /* Inside DEBUG_OPTIMISE_MORE_r: print "RExC_seen: ", then the name of each
      * REG_*_SEEN bit tested below that is set in RExC_seen, then a newline.
      * NOTE: the expansion already ends in ';', so "DEBUG_RExC_seen();" yields
      * an extra empty statement. */
1067 #define DEBUG_RExC_seen()                                                   \
1068         DEBUG_OPTIMISE_MORE_r({                                             \
1069             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1070                                                                             \
1071             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1072                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1073                                                                             \
1074             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1075                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1076                                                                             \
1077             if (RExC_seen & REG_GPOS_SEEN)                                  \
1078                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1079                                                                             \
1080             if (RExC_seen & REG_RECURSE_SEEN)                               \
1081                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1082                                                                             \
1083             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1084                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1085                                                                             \
1086             if (RExC_seen & REG_VERBARG_SEEN)                               \
1087                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1088                                                                             \
1089             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1090                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1091                                                                             \
1092             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1093                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1094                                                                             \
1095             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1096                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1097                                                                             \
1098             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1099                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1100                                                                             \
1101             Perl_re_printf( aTHX_ "\n");                                    \
1102         });
1103
1104 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1105   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1106
1107
1108 #ifdef DEBUGGING
     /* Print, via Perl_re_printf(), 'open_str', then the name of every SF_ and
      * SCF_ flag tested below that is set in 'flags' (each name followed by a
      * space), then 'close_str'.  Nothing at all is printed when 'flags' is 0. */
1109 static void
1110 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1111                                     const char *close_str)
1112 {
1113     if (!flags)
1114         return;
1115
1116     Perl_re_printf( aTHX_  "%s", open_str);
1117     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1118     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1119     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1120     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1121     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1122     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1123     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1124     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1125     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1126     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1127     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1128     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1129     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1130     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1131     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1132     Perl_re_printf( aTHX_  "%s", close_str);
1133 }
1134
1135
     /* Debugging dump of a scan_data_t; all the work happens inside
      * DEBUG_OPTIMISE_MORE_r.
      *
      *  where   label printed at the start of the line
      *  data    structure to dump; when NULL nothing is printed
      *  depth   passed to Perl_re_indentf() to indent the line
      *  is_inf  when non-zero, "INF " is added to the output
      *
      * The output is a single line: position and flags, whilem count and last
      * close paren, and, when data->last_found is set, the last-found string
      * plus both entries of data->substrs[]. */
1136 static void
1137 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1138                     U32 depth, int is_inf)
1139 {
1140     GET_RE_DEBUG_FLAGS_DECL;
1141
1142     DEBUG_OPTIMISE_MORE_r({
1143         if (!data)
1144             return;
1145         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1146             depth,
1147             where,
1148             (IV)data->pos_min,
1149             (IV)data->pos_delta,
1150             (UV)data->flags
1151         );
1152
1153         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1154
     /* A NULL last_closep is shown as -1 */
1155         Perl_re_printf( aTHX_
1156             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1157             (IV)data->whilem_c,
1158             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1159             is_inf ? "INF " : ""
1160         );
1161
1162         if (data->last_found) {
1163             int i;
1164             Perl_re_printf(aTHX_
1165                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1166                     SvPVX_const(data->last_found),
1167                     (IV)data->last_end,
1168                     (IV)data->last_start_min,
1169                     (IV)data->last_start_max
1170             );
1171
     /* Entry 0 is labelled "Fixed", entry 1 "Float"; a "*" prefix marks
      * the one whose index equals data->cur_is_floating */
1172             for (i = 0; i < 2; i++) {
1173                 Perl_re_printf(aTHX_
1174                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1175                     data->cur_is_floating == i ? "*" : "",
1176                     i ? "Float" : "Fixed",
1177                     SvPVX_const(data->substrs[i].str),
1178                     (IV)data->substrs[i].min_offset,
1179                     (IV)data->substrs[i].max_offset
1180                 );
1181                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1182             }
1183         }
1184
1185         Perl_re_printf( aTHX_ "\n");
1186     });
1187 }
1188
1189
     /* Debugging dump of a single regnode; all the work happens inside
      * DEBUG_OPTIMISE_r.  Prints, indented by 'depth':
      *      <str>><node number>: <regprop() text> (<number of next node>)
      * followed by the names of the study flags set in 'flags' and a newline.
      * The next node comes from regnext(); 0 is shown when there is none.
      * Nothing is printed when 'scan' is NULL. */
1190 static void
1191 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1192                 regnode *scan, U32 depth, U32 flags)
1193 {
1194     GET_RE_DEBUG_FLAGS_DECL;
1195
1196     DEBUG_OPTIMISE_r({
1197         regnode *Next;
1198
1199         if (!scan)
1200             return;
1201         Next = regnext(scan);
     /* regprop() leaves the node's description in RExC_mysv */
1202         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1203         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1204             depth,
1205             str,
1206             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1207             Next ? (REG_NODE_NUM(Next)) : 0 );
1208         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1209         Perl_re_printf( aTHX_  "\n");
1210    });
1211 }
1212
1213
     /* Call-site wrappers for the two dump functions above.  Under DEBUGGING
      * they forward to S_debug_studydata() / S_debug_peep(); otherwise they
      * expand to NOOP.  DEBUG_PEEP additionally passes 'pRExC_state', which
      * must therefore be in scope wherever the macro is used. */
1214 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1215                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1216
1217 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1218                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1219
1220 #else
1221 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1222 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1223 #endif
1224
1225
1226 /* =========================================================
1227  * BEGIN edit_distance stuff.
1228  *
1229  * This calculates how many single character changes of any type are needed to
1230  * transform a string into another one.  It is taken from version 3.1 of
1231  *
1232  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1233  */
1234
1235 /* Our unsorted dictionary linked list.   */
1236 /* Note we use UVs, not chars. */
1237
1238 struct dictionary{
1239   UV key;                       /* what find() and uniquePush() match on */
1240   UV value;                     /* set to 0 by push() */
1241   struct dictionary* next;      /* following entry; NULL ends the list */
1242 };
1243 typedef struct dictionary item;
1244
1245
     /* Allocate (with Newx) a new entry holding 'key' and a 'value' of 0, link
      * it in front of 'curr' (which may be NULL for an empty list), and return
      * it as the new head of the list.  No check for an existing entry with
      * the same key is made here; see uniquePush() for that. */
1246 PERL_STATIC_INLINE item*
1247 push(UV key, item* curr)
1248 {
1249     item* head;
1250     Newx(head, 1, item);
1251     head->key = key;
1252     head->value = 0;
1253     head->next = curr;
1254     return head;
1255 }
1256
1257
     /* Linear search of the list starting at 'head'.  Returns the first entry
      * whose key equals 'key', or NULL if there is none (including when 'head'
      * itself is NULL). */
1258 PERL_STATIC_INLINE item*
1259 find(item* head, UV key)
1260 {
1261     item* iterator = head;
1262     while (iterator){
1263         if (iterator->key == key){
1264             return iterator;
1265         }
1266         iterator = iterator->next;
1267     }
1268
1269     return NULL;
1270 }
1271
     /* Ensure the list contains an entry for 'key'.  If one already exists the
      * list is left untouched and 'head' is returned; otherwise a new entry is
      * pushed on the front and that new head is returned.  Either way the
      * caller should continue with the returned pointer. */
1272 PERL_STATIC_INLINE item*
1273 uniquePush(item* head, UV key)
1274 {
1275     item* iterator = head;
1276
1277     while (iterator){
1278         if (iterator->key == key) {
1279             return head;
1280         }
1281         iterator = iterator->next;
1282     }
1283
1284     return push(key, head);
1285 }
1286
1287 PERL_STATIC_INLINE void
1288 dict_free(item* head)
1289 {
1290     item* iterator = head;
1291
1292     while (iterator) {
1293         item* temp = iterator;
1294         iterator = iterator->next;
1295         Safefree(temp);
1296     }
1297
1298     head = NULL;
1299 }
1300
1301 /* End of Dictionary Stuff */
1302
1303 /* All calculations/work are done here */
1304 STATIC int
1305 S_edit_distance(const UV* src,
1306                 const UV* tgt,
1307                 const STRLEN x,             /* length of src[] */
1308                 const STRLEN y,             /* length of tgt[] */
1309                 const SSize_t maxDistance
1310 )
1311 {
1312     item *head = NULL;
1313     UV swapCount, swapScore, targetCharCount, i, j;
1314     UV *scores;
1315     UV score_ceil = x + y;
1316
1317     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1318
1319     /* intialize matrix start values */
1320     Newx(scores, ( (x + 2) * (y + 2)), UV);
1321     scores[0] = score_ceil;
1322     scores[1 * (y + 2) + 0] = score_ceil;
1323     scores[0 * (y + 2) + 1] = score_ceil;
1324     scores[1 * (y + 2) + 1] = 0;
1325     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1326
1327     /* work loops    */
1328     /* i = src index */
1329     /* j = tgt index */
1330     for (i=1;i<=x;i++) {
1331         if (i < x)
1332             head = uniquePush(head, src[i]);
1333         scores[(i+1) * (y + 2) + 1] = i;
1334         scores[(i+1) * (y + 2) + 0] = score_ceil;
1335         swapCount = 0;
1336
1337         for (j=1;j<=y;j++) {
1338             if (i == 1) {
1339                 if(j < y)
1340                 head = uniquePush(head, tgt[j]);
1341                 scores[1 * (y + 2) + (j + 1)] = j;
1342                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1343             }
1344
1345             targetCharCount = find(head, tgt[j-1])->value;
1346             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1347
1348             if (src[i-1] != tgt[j-1]){
1349                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1350             }
1351             else {
1352                 swapCount = j;
1353                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1354             }
1355         }
1356
1357         find(head, src[i-1])->value = i;
1358     }
1359
1360     {
1361         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1362         dict_free(head);
1363         Safefree(scores);
1364         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1365     }
1366 }
1367
1368 /* END of edit_distance() stuff
1369  * ========================================================= */
1370
1371 /* is c a control character for which we have a mnemonic? */
1372 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1373
1374 STATIC const char *
1375 S_cntrl_to_mnemonic(const U8 c)
1376 {
1377     /* Returns the mnemonic string that represents character 'c', if one
1378      * exists; NULL otherwise.  The only ones that exist for the purposes of
1379      * this routine are a few control characters */
1380
1381     switch (c) {
1382         case '\a':       return "\\a";
1383         case '\b':       return "\\b";
1384         case ESC_NATIVE: return "\\e";
1385         case '\f':       return "\\f";
1386         case '\n':       return "\\n";
1387         case '\r':       return "\\r";
1388         case '\t':       return "\\t";
1389     }
1390
1391     return NULL;
1392 }
1393
1394 /* Mark that we cannot extend a found fixed substring at this point.
1395    Update the longest found anchored substring or the longest found
1396    floating substrings if needed. */
1397
1398 STATIC void
1399 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1400                     SSize_t *minlenp, int is_inf)
1401 {
1402     const STRLEN l = CHR_SVLEN(data->last_found);
1403     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1404     const STRLEN old_l = CHR_SVLEN(longest_sv);
1405     GET_RE_DEBUG_FLAGS_DECL;
1406
1407     PERL_ARGS_ASSERT_SCAN_COMMIT;
1408
1409     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1410         const U8 i = data->cur_is_floating;
1411         SvSetMagicSV(longest_sv, data->last_found);
1412         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1413
1414         if (!i) /* fixed */
1415             data->substrs[0].max_offset = data->substrs[0].min_offset;
1416         else { /* float */
1417             data->substrs[1].max_offset = (l
1418                           ? data->last_start_max
1419                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1420                                          ? SSize_t_MAX
1421                                          : data->pos_min + data->pos_delta));
1422             if (is_inf
1423                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1424                 data->substrs[1].max_offset = SSize_t_MAX;
1425         }
1426
1427         if (data->flags & SF_BEFORE_EOL)
1428             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1429         else
1430             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1431         data->substrs[i].minlenp = minlenp;
1432         data->substrs[i].lookbehind = 0;
1433     }
1434
1435     SvCUR_set(data->last_found, 0);
1436     {
1437         SV * const sv = data->last_found;
1438         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1439             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1440             if (mg)
1441                 mg->mg_len = 0;
1442         }
1443     }
1444     data->last_end = -1;
1445     data->flags &= ~SF_BEFORE_EOL;
1446     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1447 }
1448
1449 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1450  * list that describes which code points it matches */
1451
1452 STATIC void
1453 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1454 {
1455     /* Set the SSC 'ssc' to match an empty string or any code point */
1456
1457     PERL_ARGS_ASSERT_SSC_ANYTHING;
1458
1459     assert(is_ANYOF_SYNTHETIC(ssc));
1460
1461     /* mortalize so won't leak */
1462     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1463     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1464 }
1465
1466 STATIC int
1467 S_ssc_is_anything(const regnode_ssc *ssc)
1468 {
1469     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1470      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1471      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1472      * in any way, so there's no point in using it */
1473
1474     UV start, end;
1475     bool ret;
1476
1477     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1478
1479     assert(is_ANYOF_SYNTHETIC(ssc));
1480
1481     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1482         return FALSE;
1483     }
1484
1485     /* See if the list consists solely of the range 0 - Infinity */
1486     invlist_iterinit(ssc->invlist);
1487     ret = invlist_iternext(ssc->invlist, &start, &end)
1488           && start == 0
1489           && end == UV_MAX;
1490
1491     invlist_iterfinish(ssc->invlist);
1492
1493     if (ret) {
1494         return TRUE;
1495     }
1496
1497     /* If e.g., both \w and \W are set, matches everything */
1498     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1499         int i;
1500         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1501             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1502                 return TRUE;
1503             }
1504         }
1505     }
1506
1507     return FALSE;
1508 }
1509
1510 STATIC void
1511 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1512 {
1513     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1514      * string, any code point, or any posix class under locale */
1515
1516     PERL_ARGS_ASSERT_SSC_INIT;
1517
1518     Zero(ssc, 1, regnode_ssc);
1519     set_ANYOF_SYNTHETIC(ssc);
1520     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1521     ssc_anything(ssc);
1522
1523     /* If any portion of the regex is to operate under locale rules that aren't
1524      * fully known at compile time, initialization includes it.  The reason
1525      * this isn't done for all regexes is that the optimizer was written under
1526      * the assumption that locale was all-or-nothing.  Given the complexity and
1527      * lack of documentation in the optimizer, and that there are inadequate
1528      * test cases for locale, many parts of it may not work properly, it is
1529      * safest to avoid locale unless necessary. */
1530     if (RExC_contains_locale) {
1531         ANYOF_POSIXL_SETALL(ssc);
1532     }
1533     else {
1534         ANYOF_POSIXL_ZERO(ssc);
1535     }
1536 }
1537
1538 STATIC int
1539 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1540                         const regnode_ssc *ssc)
1541 {
1542     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1543      * to the list of code points matched, and locale posix classes; hence does
1544      * not check its flags) */
1545
1546     UV start, end;
1547     bool ret;
1548
1549     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1550
1551     assert(is_ANYOF_SYNTHETIC(ssc));
1552
1553     invlist_iterinit(ssc->invlist);
1554     ret = invlist_iternext(ssc->invlist, &start, &end)
1555           && start == 0
1556           && end == UV_MAX;
1557
1558     invlist_iterfinish(ssc->invlist);
1559
1560     if (! ret) {
1561         return FALSE;
1562     }
1563
1564     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1565         return FALSE;
1566     }
1567
1568     return TRUE;
1569 }
1570
1571 #define INVLIST_INDEX 0
1572 #define ONLY_LOCALE_MATCHES_INDEX 1
1573 #define DEFERRED_USER_DEFINED_INDEX 2
1574
1575 STATIC SV*
1576 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1577                                const regnode_charclass* const node)
1578 {
1579     /* Returns a mortal inversion list defining which code points are matched
1580      * by 'node', which is of type ANYOF.  Handles complementing the result if
1581      * appropriate.  If some code points aren't knowable at this time, the
1582      * returned list must, and will, contain every code point that is a
1583      * possibility. */
1584
1585     dVAR;
1586     SV* invlist = NULL;
1587     SV* only_utf8_locale_invlist = NULL;
1588     unsigned int i;
1589     const U32 n = ARG(node);
1590     bool new_node_has_latin1 = FALSE;
1591     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1592                       ? 0
1593                       : ANYOF_FLAGS(node);
1594
1595     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1596
1597     /* Look at the data structure created by S_set_ANYOF_arg() */
1598     if (n != ANYOF_ONLY_HAS_BITMAP) {
1599         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1600         AV * const av = MUTABLE_AV(SvRV(rv));
1601         SV **const ary = AvARRAY(av);
1602         assert(RExC_rxi->data->what[n] == 's');
1603
1604         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1605
1606             /* Here there are things that won't be known until runtime -- we
1607              * have to assume it could be anything */
1608             invlist = sv_2mortal(_new_invlist(1));
1609             return _add_range_to_invlist(invlist, 0, UV_MAX);
1610         }
1611         else if (ary[INVLIST_INDEX]) {
1612
1613             /* Use the node's inversion list */
1614             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1615         }
1616
1617         /* Get the code points valid only under UTF-8 locales */
1618         if (   (flags & ANYOFL_FOLD)
1619             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1620         {
1621             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1622         }
1623     }
1624
1625     if (! invlist) {
1626         invlist = sv_2mortal(_new_invlist(0));
1627     }
1628
1629     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1630      * code points, and an inversion list for the others, but if there are code
1631      * points that should match only conditionally on the target string being
1632      * UTF-8, those are placed in the inversion list, and not the bitmap.
1633      * Since there are circumstances under which they could match, they are
1634      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1635      * to exclude them here, so that when we invert below, the end result
1636      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1637      * have to do this here before we add the unconditionally matched code
1638      * points */
1639     if (flags & ANYOF_INVERT) {
1640         _invlist_intersection_complement_2nd(invlist,
1641                                              PL_UpperLatin1,
1642                                              &invlist);
1643     }
1644
1645     /* Add in the points from the bit map */
1646     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1647         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1648             if (ANYOF_BITMAP_TEST(node, i)) {
1649                 unsigned int start = i++;
1650
1651                 for (;    i < NUM_ANYOF_CODE_POINTS
1652                        && ANYOF_BITMAP_TEST(node, i); ++i)
1653                 {
1654                     /* empty */
1655                 }
1656                 invlist = _add_range_to_invlist(invlist, start, i-1);
1657                 new_node_has_latin1 = TRUE;
1658             }
1659         }
1660     }
1661
1662     /* If this can match all upper Latin1 code points, have to add them
1663      * as well.  But don't add them if inverting, as when that gets done below,
1664      * it would exclude all these characters, including the ones it shouldn't
1665      * that were added just above */
1666     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1667         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1668     {
1669         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1670     }
1671
1672     /* Similarly for these */
1673     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1674         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1675     }
1676
1677     if (flags & ANYOF_INVERT) {
1678         _invlist_invert(invlist);
1679     }
1680     else if (flags & ANYOFL_FOLD) {
1681         if (new_node_has_latin1) {
1682
1683             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1684              * the locale.  We can skip this if there are no 0-255 at all. */
1685             _invlist_union(invlist, PL_Latin1, &invlist);
1686
1687             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1688             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1689         }
1690         else {
1691             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1692                 invlist = add_cp_to_invlist(invlist, 'I');
1693             }
1694             if (_invlist_contains_cp(invlist,
1695                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1696             {
1697                 invlist = add_cp_to_invlist(invlist, 'i');
1698             }
1699         }
1700     }
1701
1702     /* Similarly add the UTF-8 locale possible matches.  These have to be
1703      * deferred until after the non-UTF-8 locale ones are taken care of just
1704      * above, or it leads to wrong results under ANYOF_INVERT */
1705     if (only_utf8_locale_invlist) {
1706         _invlist_union_maybe_complement_2nd(invlist,
1707                                             only_utf8_locale_invlist,
1708                                             flags & ANYOF_INVERT,
1709                                             &invlist);
1710     }
1711
1712     return invlist;
1713 }
1714
1715 /* These two functions currently do the exact same thing */
1716 #define ssc_init_zero           ssc_init
1717
1718 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1719 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1720
1721 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1722  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1723  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1724
1725 STATIC void
1726 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1727                 const regnode_charclass *and_with)
1728 {
1729     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1730      * another SSC or a regular ANYOF class.  Can create false positives. */
1731
1732     SV* anded_cp_list;
1733     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1734                           ? 0
1735                           : ANYOF_FLAGS(and_with);
1736     U8  anded_flags;
1737
1738     PERL_ARGS_ASSERT_SSC_AND;
1739
1740     assert(is_ANYOF_SYNTHETIC(ssc));
1741
1742     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1743      * the code point inversion list and just the relevant flags */
1744     if (is_ANYOF_SYNTHETIC(and_with)) {
1745         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1746         anded_flags = and_with_flags;
1747
1748         /* XXX This is a kludge around what appears to be deficiencies in the
1749          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1750          * there are paths through the optimizer where it doesn't get weeded
1751          * out when it should.  And if we don't make some extra provision for
1752          * it like the code just below, it doesn't get added when it should.
1753          * This solution is to add it only when AND'ing, which is here, and
1754          * only when what is being AND'ed is the pristine, original node
1755          * matching anything.  Thus it is like adding it to ssc_anything() but
1756          * only when the result is to be AND'ed.  Probably the same solution
1757          * could be adopted for the same problem we have with /l matching,
1758          * which is solved differently in S_ssc_init(), and that would lead to
1759          * fewer false positives than that solution has.  But if this solution
1760          * creates bugs, the consequences are only that a warning isn't raised
1761          * that should be; while the consequences for having /l bugs is
1762          * incorrect matches */
1763         if (ssc_is_anything((regnode_ssc *)and_with)) {
1764             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1765         }
1766     }
1767     else {
1768         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1769         if (OP(and_with) == ANYOFD) {
1770             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1771         }
1772         else {
1773             anded_flags = and_with_flags
1774             &( ANYOF_COMMON_FLAGS
1775               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1776               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1777             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1778                 anded_flags &=
1779                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1780             }
1781         }
1782     }
1783
1784     ANYOF_FLAGS(ssc) &= anded_flags;
1785
1786     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1787      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1788      * 'and_with' may be inverted.  When not inverted, we have the situation of
1789      * computing:
1790      *  (C1 | P1) & (C2 | P2)
1791      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1792      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1793      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1794      *                    <=  ((C1 & C2) | P1 | P2)
1795      * Alternatively, the last few steps could be:
1796      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1797      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1798      *                    <=  (C1 | C2 | (P1 & P2))
1799      * We favor the second approach if either P1 or P2 is non-empty.  This is
1800      * because these components are a barrier to doing optimizations, as what
1801      * they match cannot be known until the moment of matching as they are
1802      * dependent on the current locale, 'AND"ing them likely will reduce or
1803      * eliminate them.
1804      * But we can do better if we know that C1,P1 are in their initial state (a
1805      * frequent occurrence), each matching everything:
1806      *  (<everything>) & (C2 | P2) =  C2 | P2
1807      * Similarly, if C2,P2 are in their initial state (again a frequent
1808      * occurrence), the result is a no-op
1809      *  (C1 | P1) & (<everything>) =  C1 | P1
1810      *
1811      * Inverted, we have
1812      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1813      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1814      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1815      * */
1816
1817     if ((and_with_flags & ANYOF_INVERT)
1818         && ! is_ANYOF_SYNTHETIC(and_with))
1819     {
1820         unsigned int i;
1821
1822         ssc_intersection(ssc,
1823                          anded_cp_list,
1824                          FALSE /* Has already been inverted */
1825                          );
1826
1827         /* If either P1 or P2 is empty, the intersection will be also; can skip
1828          * the loop */
1829         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1830             ANYOF_POSIXL_ZERO(ssc);
1831         }
1832         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1833
1834             /* Note that the Posix class component P from 'and_with' actually
1835              * looks like:
1836              *      P = Pa | Pb | ... | Pn
1837              * where each component is one posix class, such as in [\w\s].
1838              * Thus
1839              *      ~P = ~(Pa | Pb | ... | Pn)
1840              *         = ~Pa & ~Pb & ... & ~Pn
1841              *        <= ~Pa | ~Pb | ... | ~Pn
1842              * The last is something we can easily calculate, but unfortunately
1843              * is likely to have many false positives.  We could do better
1844              * in some (but certainly not all) instances if two classes in
1845              * P have known relationships.  For example
1846              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1847              * So
1848              *      :lower: & :print: = :lower:
1849              * And similarly for classes that must be disjoint.  For example,
1850              * since \s and \w can have no elements in common based on rules in
1851              * the POSIX standard,
1852              *      \w & ^\S = nothing
1853              * Unfortunately, some vendor locales do not meet the Posix
1854              * standard, in particular almost everything by Microsoft.
1855              * The loop below just changes e.g., \w into \W and vice versa */
1856
1857             regnode_charclass_posixl temp;
1858             int add = 1;    /* To calculate the index of the complement */
1859
1860             Zero(&temp, 1, regnode_charclass_posixl);
1861             ANYOF_POSIXL_ZERO(&temp);
1862             for (i = 0; i < ANYOF_MAX; i++) {
1863                 assert(i % 2 != 0
1864                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1865                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1866
1867                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1868                     ANYOF_POSIXL_SET(&temp, i + add);
1869                 }
1870                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1871             }
1872             ANYOF_POSIXL_AND(&temp, ssc);
1873
1874         } /* else ssc already has no posixes */
1875     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1876          in its initial state */
1877     else if (! is_ANYOF_SYNTHETIC(and_with)
1878              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1879     {
1880         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1881          * copy it over 'ssc' */
1882         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1883             if (is_ANYOF_SYNTHETIC(and_with)) {
1884                 StructCopy(and_with, ssc, regnode_ssc);
1885             }
1886             else {
1887                 ssc->invlist = anded_cp_list;
1888                 ANYOF_POSIXL_ZERO(ssc);
1889                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1890                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1891                 }
1892             }
1893         }
1894         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1895                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1896         {
1897             /* One or the other of P1, P2 is non-empty. */
1898             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1899                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1900             }
1901             ssc_union(ssc, anded_cp_list, FALSE);
1902         }
1903         else { /* P1 = P2 = empty */
1904             ssc_intersection(ssc, anded_cp_list, FALSE);
1905         }
1906     }
1907 }
1908
1909 STATIC void
1910 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1911                const regnode_charclass *or_with)
1912 {
1913     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1914      * another SSC or a regular ANYOF class.  Can create false positives if
1915      * 'or_with' is to be inverted. */
1916
1917     SV* ored_cp_list;
1918     U8 ored_flags;
1919     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1920                          ? 0
1921                          : ANYOF_FLAGS(or_with);
1922
1923     PERL_ARGS_ASSERT_SSC_OR;
1924
1925     assert(is_ANYOF_SYNTHETIC(ssc));
1926
1927     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1928      * the code point inversion list and just the relevant flags */
1929     if (is_ANYOF_SYNTHETIC(or_with)) {
1930         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1931         ored_flags = or_with_flags;
1932     }
1933     else {
1934         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1935         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1936         if (OP(or_with) != ANYOFD) {
1937             ored_flags
1938             |= or_with_flags
1939              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1940                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1941             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1942                 ored_flags |=
1943                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1944             }
1945         }
1946     }
1947
1948     ANYOF_FLAGS(ssc) |= ored_flags;
1949
1950     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1951      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1952      * 'or_with' may be inverted.  When not inverted, we have the simple
1953      * situation of computing:
1954      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1955      * If P1|P2 yields a situation with both a class and its complement are
1956      * set, like having both \w and \W, this matches all code points, and we
1957      * can delete these from the P component of the ssc going forward.  XXX We
1958      * might be able to delete all the P components, but I (khw) am not certain
1959      * about this, and it is better to be safe.
1960      *
1961      * Inverted, we have
1962      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1963      *                         <=  (C1 | P1) | ~C2
1964      *                         <=  (C1 | ~C2) | P1
1965      * (which results in actually simpler code than the non-inverted case)
1966      * */
1967
1968     if ((or_with_flags & ANYOF_INVERT)
1969         && ! is_ANYOF_SYNTHETIC(or_with))
1970     {
1971         /* We ignore P2, leaving P1 going forward */
1972     }   /* else  Not inverted */
1973     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1974         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1975         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1976             unsigned int i;
1977             for (i = 0; i < ANYOF_MAX; i += 2) {
1978                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1979                 {
1980                     ssc_match_all_cp(ssc);
1981                     ANYOF_POSIXL_CLEAR(ssc, i);
1982                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1983                 }
1984             }
1985         }
1986     }
1987
1988     ssc_union(ssc,
1989               ored_cp_list,
1990               FALSE /* Already has been inverted */
1991               );
1992 }
1993
1994 PERL_STATIC_INLINE void
1995 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1996 {
1997     PERL_ARGS_ASSERT_SSC_UNION;
1998
1999     assert(is_ANYOF_SYNTHETIC(ssc));
2000
2001     _invlist_union_maybe_complement_2nd(ssc->invlist,
2002                                         invlist,
2003                                         invert2nd,
2004                                         &ssc->invlist);
2005 }
2006
2007 PERL_STATIC_INLINE void
2008 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2009                          SV* const invlist,
2010                          const bool invert2nd)
2011 {
2012     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2013
2014     assert(is_ANYOF_SYNTHETIC(ssc));
2015
2016     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2017                                                invlist,
2018                                                invert2nd,
2019                                                &ssc->invlist);
2020 }
2021
2022 PERL_STATIC_INLINE void
2023 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2024 {
2025     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2026
2027     assert(is_ANYOF_SYNTHETIC(ssc));
2028
2029     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2030 }
2031
2032 PERL_STATIC_INLINE void
2033 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2034 {
2035     /* AND just the single code point 'cp' into the SSC 'ssc' */
2036
2037     SV* cp_list = _new_invlist(2);
2038
2039     PERL_ARGS_ASSERT_SSC_CP_AND;
2040
2041     assert(is_ANYOF_SYNTHETIC(ssc));
2042
2043     cp_list = add_cp_to_invlist(cp_list, cp);
2044     ssc_intersection(ssc, cp_list,
2045                      FALSE /* Not inverted */
2046                      );
2047     SvREFCNT_dec_NN(cp_list);
2048 }
2049
2050 PERL_STATIC_INLINE void
2051 S_ssc_clear_locale(regnode_ssc *ssc)
2052 {
2053     /* Set the SSC 'ssc' to not match any locale things */
2054     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2055
2056     assert(is_ANYOF_SYNTHETIC(ssc));
2057     /* Zero the POSIXL class data, then turn off the locale-related flags */
2058     ANYOF_POSIXL_ZERO(ssc);
2059     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2060 }
2061
2062 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2063 /* (short local alias; used below as the pool size for wide /u matches) */
2064 STATIC bool
2065 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2066 {
2067     /* The synthetic start class is used to hopefully quickly winnow down
2068      * places where a pattern could start a match in the target string.  If it
2069      * doesn't really narrow things down that much, there isn't much point to
2070      * having the overhead of using it.  This function uses some very crude
2071      * heuristics to decide if to use the ssc or not.
2072      *
2073      * It returns TRUE if 'ssc' rules out more than half what it considers to
2074      * be the "likely" possible matches, but of course it doesn't know what the
2075      * actual things being matched are going to be; these are only guesses
2076      *
2077      * For /l matches, it assumes that the only likely matches are going to be
2078      *      in the 0-255 range, uniformly distributed, so half of that is 127
2079      * For /a and /d matches, it assumes that the likely matches will be just
2080      *      the ASCII range, so half of that is 63
2081      * For /u and there isn't anything matching above the Latin1 range, it
2082      *      assumes that that is the only range likely to be matched, and uses
2083      *      half that as the cut-off: 127.  If anything matches above Latin1,
2084      *      it assumes that all of Unicode could match (uniformly), except for
2085      *      non-Unicode code points and things in the General Category "Other"
2086      *      (unassigned, private use, surrogates, controls and formats).  This
2087      *      is a much larger number. */
2088
2089     U32 count = 0;      /* Running total of number of code points matched by
2090                            'ssc' */
2091     UV start, end;      /* Start and end points of current range in inversion
2092                            list.  XXX outdated.  UTF-8 locales are common, what about invert? */
2093     const U32 max_code_points = (LOC)
2094                                 ?  256
2095                                 : ((  ! UNI_SEMANTICS
2096                                     ||  invlist_highest(ssc->invlist) < 256)
2097                                   ? 128
2098                                   : NON_OTHER_COUNT);
2099     const U32 max_match = max_code_points / 2;
2100     /* NOTE(review): /u with nothing above Latin1 gets 128 here, yet the text above says cut-off 127 */
2101     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2102     /* Tally the matched code points that lie below max_code_points */
2103     invlist_iterinit(ssc->invlist);
2104     while (invlist_iternext(ssc->invlist, &start, &end)) {
2105         if (start >= max_code_points) {
2106             break;  /* presumably ranges ascend, so nothing later can count */
2107         }   /* NOTE(review): this exit skips invlist_iterfinish(), unlike the one below */
2108         end = MIN(end, max_code_points - 1);  /* ignore what lies past the pool */
2109         count += end - start + 1;
2110         if (count >= max_match) {   /* matches half or more: not selective */
2111             invlist_iterfinish(ssc->invlist);
2112             return FALSE;
2113         }
2114     }
2115
2116     return TRUE;
2117 }
2118
2119
2120 STATIC void
2121 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2122 {
2123     /* The inversion list in the SSC is marked mortal; now we need a more
2124      * permanent copy, which is stored the same way that is done in a regular
2125      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2126      * map */
2127
2128     SV* invlist = invlist_clone(ssc->invlist, NULL);
2129
2130     PERL_ARGS_ASSERT_SSC_FINALIZE;
2131
2132     assert(is_ANYOF_SYNTHETIC(ssc));
2133
2134     /* The code in this file assumes that all but these flags aren't relevant
2135      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2136      * by the time we reach here */
2137     assert(! (ANYOF_FLAGS(ssc)
2138         & ~( ANYOF_COMMON_FLAGS
2139             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2140             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2141     /* Fill the node from the clone; it is passed by address, so may be changed */
2142     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2143     /* Presumably attaches what is left to the node; then drop our reference */
2144     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2145     SvREFCNT_dec(invlist);
2146
2147     /* Make sure the node is clone-safe */
2148     ssc->invlist = NULL;
2149     /* Pick the final opcode: POSIXL classes present, else any locale use */
2150     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2151         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2152         OP(ssc) = ANYOFPOSIXL;
2153     }
2154     else if (RExC_contains_locale) {
2155         OP(ssc) = ANYOFL;
2156     }
2157     /* Sanity: locale flags may remain set only if the pattern uses locale */
2158     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2159 }
2160 /* Per-state transition lists: slot 0 is a header (forid = next free slot, newstate = allocated size) */
2161 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2162 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2163 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2164 #define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list           \
2165                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2166                                : 0 )
2167 /* TRIE_LIST_USED: number of real entries, 0 if no list; all need 'trie' in scope */
2168
2169 #ifdef DEBUGGING
2170 /*
2171    dump_trie(trie,widecharmap,revcharmap,depth)
2172    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc,depth)
2173    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc,depth)
2174
2175    These routines dump out a trie in a somewhat readable format.
2176    The _interim_ variants are used for debugging the interim
2177    tables that are used to generate the final compressed
2178    representation which is what dump_trie expects.
2179
2180    Part of the reason for their existence is to provide a form
2181    of documentation as to how the different representations function.
2182
2183 */
2184
2185 /*
2186   Dumps the final compressed table form of the trie to Perl_debug_log.
2187   Used for debugging make_trie().
2188 */
2189
2190 STATIC void
2191 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2192             AV *revcharmap, U32 depth)
2193 {
2194     U32 state;
2195     SV *sv=sv_newmortal();
2196     int colwidth= widecharmap ? 6 : 4;
2197     U16 word;
2198     GET_RE_DEBUG_FLAGS_DECL;
2199     /* 'revcharmap' supplies the column labels; 'widecharmap' only widens them */
2200     PERL_ARGS_ASSERT_DUMP_TRIE;
2201     /* Header row: fixed titles, then one label per unique character */
2202     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2203         depth+1, "Match","Base","Ofs" );
2204
2205     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2206         SV ** const tmp = av_fetch( revcharmap, state, 0);
2207         if ( tmp ) {
2208             Perl_re_printf( aTHX_  "%*s",
2209                 colwidth,
2210                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2211                             PL_colors[0], PL_colors[1],
2212                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2213                             PERL_PV_ESCAPE_FIRSTCHAR
2214                 )
2215             );
2216         }
2217     }
2218     Perl_re_printf( aTHX_  "\n");
2219     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2220     /* Separator row: one run of dashes per character column */
2221     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2222         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2223     Perl_re_printf( aTHX_  "\n");
2224     /* One row per state, starting from state 1 */
2225     for( state = 1 ; state < trie->statecount ; state++ ) {
2226         const U32 base = trie->states[ state ].trans.base;
2227
2228         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2229         /* Word number if this state accepts a word, else padding */
2230         if ( trie->states[ state ].wordnum ) {
2231             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2232         } else {
2233             Perl_re_printf( aTHX_  "%6s", "" );
2234         }
2235
2236         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2237
2238         if ( base ) {
2239             U32 ofs = 0;
2240             /* Advance 'ofs' to the first trans[] slot owned by this state (or the end) */
2241             while( ( base + ofs  < trie->uniquecharcount ) ||
2242                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2243                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2244                                                                     != state))
2245                     ofs++;
2246
2247             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2248             /* One cell per character: the next state, or a dot when there is none */
2249             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2250                 if ( ( base + ofs >= trie->uniquecharcount )
2251                         && ( base + ofs - trie->uniquecharcount
2252                                                         < trie->lasttrans )
2253                         && trie->trans[ base + ofs
2254                                     - trie->uniquecharcount ].check == state )
2255                 {
2256                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2257                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2258                    );
2259                 } else {
2260                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2261                 }
2262             }
2263
2264             Perl_re_printf( aTHX_  "]");
2265
2266         }
2267         Perl_re_printf( aTHX_  "\n" );
2268     }
2269     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2270                                 depth);  /* NOTE(review): 'depth' here, 'depth+1' elsewhere -- intended? */
2271     for (word=1; word <= trie->wordcount; word++) {
2272         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2273             (int)word, (int)(trie->wordinfo[word].prev),
2274             (int)(trie->wordinfo[word].len));
2275     }
2276     Perl_re_printf( aTHX_  "\n" );
2277 }
2278 /*
2279   Dumps a fully constructed but uncompressed trie in list form.
2280   List tries are normally only used for construction when the number of
2281   possible chars (trie->uniquecharcount) is very high.
2282   Used for debugging make_trie().
2283 */
2284 STATIC void
2285 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2286                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2287                          U32 depth)
2288 {
2289     U32 state;
2290     SV *sv=sv_newmortal();
2291     int colwidth= widecharmap ? 6 : 4;
2292     GET_RE_DEBUG_FLAGS_DECL;
2293     /* Dumps states 1 .. next_alloc-1; 'widecharmap' only affects column width */
2294     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2295
2296     /* print out the table precompression.  */
2297     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2298             depth+1 );
2299     Perl_re_indentf( aTHX_  "%s",
2300             depth+1, "------:-----+-----------------\n" );
2301     /* One line per state: its number, accepted word (if any), then its transitions */
2302     for( state=1 ; state < next_alloc ; state ++ ) {
2303         U16 charid;
2304         /* Slot 0 of each list is bookkeeping (see TRIE_LIST_CUR/LEN); entries start at 1 */
2305         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2306             depth+1, (UV)state  );
2307         if ( ! trie->states[ state ].wordnum ) {
2308             Perl_re_printf( aTHX_  "%5s| ","");
2309         } else {
2310             Perl_re_printf( aTHX_  "W%4x| ",
2311                 trie->states[ state ].wordnum
2312             );
2313         }
2314         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2315             SV ** const tmp = av_fetch( revcharmap,
2316                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2317             if ( tmp ) {
2318                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2319                     colwidth,
2320                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2321                               colwidth,
2322                               PL_colors[0], PL_colors[1],
2323                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2324                               | PERL_PV_ESCAPE_FIRSTCHAR
2325                     ) ,
2326                     TRIE_LIST_ITEM(state, charid).forid,
2327                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2328                 );
2329                 if (!(charid % 10))   /* wrap the output after every 10th entry */
2330                     Perl_re_printf( aTHX_  "\n%*s| ",
2331                         (int)((depth * 2) + 14), "");
2332             }
2333         }
2334         Perl_re_printf( aTHX_  "\n");
2335     }
2336 }
2337
2338 /*
2339   Dumps a fully constructed but uncompressed trie in table form.
2340   This is the normal DFA style state transition table, with a few
2341   twists to facilitate compression later.
2342   Used for debugging make_trie().
2343 */
2344 STATIC void
2345 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2346                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2347                           U32 depth)
2348 {
2349     U32 state;
2350     U16 charid;
2351     SV *sv=sv_newmortal();
2352     int colwidth= widecharmap ? 6 : 4;
2353     GET_RE_DEBUG_FLAGS_DECL;
2354
2355     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2356
2357     /*
2358        print out the table precompression so that we can do a visual check
2359        that they are identical.
2360      */
2361
2362     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2363     /* Header row: the label of each character column, from 'revcharmap' */
2364     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2365         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2366         if ( tmp ) {
2367             Perl_re_printf( aTHX_  "%*s",
2368                 colwidth,
2369                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2370                             PL_colors[0], PL_colors[1],
2371                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2372                             PERL_PV_ESCAPE_FIRSTCHAR
2373                 )
2374             );
2375         }
2376     }
2377
2378     Perl_re_printf( aTHX_ "\n");
2379     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2380
2381     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2382         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2383     }
2384
2385     Perl_re_printf( aTHX_  "\n" );
2386     /* Each row spans uniquecharcount consecutive trans[] slots, hence the stride */
2387     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2388         /* The row label is the node number derived from the table offset */
2389         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2390             depth+1,
2391             (UV)TRIE_NODENUM( state ) );
2392         /* One cell per character: the target node number, or "." if there is none */
2393         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2394             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2395             if (v)
2396                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2397             else
2398                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2399         }
2400         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2401             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2402                                             (UV)trie->trans[ state ].check );
2403         } else {
2404             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2405                                             (UV)trie->trans[ state ].check,
2406             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2407         }
2408     }
2409 }
2410
2411 #endif
2412
2413
2414 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2415   startbranch: the first branch in the whole branch sequence
2416   first      : start branch of sequence of branch-exact nodes.
2417                May be the same as startbranch
2418   last       : Thing following the last branch.
2419                May be the same as tail.
2420   tail       : item following the branch sequence
2421   word_count : words in the sequence
2422   flags      : currently the OP() type we will be building, one of EXACT, EXACT_REQ8, EXACTL, EXACTF, EXACTFAA, EXACTFU, EXACTFUP or EXACTFLU8
2423   depth      : indent depth
2424
2425 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2426
2427 A trie is an N'ary tree where the branches are determined by digital
2428 decomposition of the key. IE, at the root node you look up the 1st character and
2429 follow that branch repeat until you find the end of the branches. Nodes can be
2430 marked as "accepting" meaning they represent a complete word. Eg:
2431
2432   /he|she|his|hers/
2433
2434 would convert into the following structure. Numbers represent states, letters
2435 following numbers represent valid transitions on the letter from that state, if
2436 the number is in square brackets it represents an accepting state, otherwise it
2437 will be in parentheses.
2438
2439       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2440       |    |
2441       |   (2)
2442       |    |
2443      (1)   +-i->(6)-+-s->[7]
2444       |
2445       +-s->(3)-+-h->(4)-+-e->[5]
2446
2447       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2448
2449 This shows that when matching against the string 'hers' we will begin at state 1
2450 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2451 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2452 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2453 single traverse. We store a mapping from each accepting state to which word was
2454 matched, and then when we have multiple possibilities we try to complete the
2455 rest of the regex in the order in which they occurred in the alternation.
2456
2457 The only prior NFA like behaviour that would be changed by the TRIE support is
2458 the silent ignoring of duplicate alternations which are of the form:
2459
2460  / (DUPE|DUPE) X? (?{ ... }) Y /x
2461
2462 Thus EVAL blocks following a trie may be called a different number of times with
2463 and without the optimisation. With the optimisations dupes will be silently
2464 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2465 the following demonstrates:
2466
2467  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2468
2469 which prints out 'word' three times, but
2470
2471  'words'=~/(word|word|word)(?{ print $1 })S/
2472
2473 which doesn't print it out at all. This is due to other optimisations kicking in.
2474
2475 Example of what happens on a structural level:
2476
2477 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2478
2479    1: CURLYM[1] {1,32767}(18)
2480    5:   BRANCH(8)
2481    6:     EXACT <ac>(16)
2482    8:   BRANCH(11)
2483    9:     EXACT <ad>(16)
2484   11:   BRANCH(14)
2485   12:     EXACT <ab>(16)
2486   16:   SUCCEED(0)
2487   17:   NOTHING(18)
2488   18: END(0)
2489
2490 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2491 and should turn into:
2492
2493    1: CURLYM[1] {1,32767}(18)
2494    5:   TRIE(16)
2495         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2496           <ac>
2497           <ad>
2498           <ab>
2499   16:   SUCCEED(0)
2500   17:   NOTHING(18)
2501   18: END(0)
2502
2503 Cases where tail != last would be like /(?:foo|bar)baz/:
2504
2505    1: BRANCH(4)
2506    2:   EXACT <foo>(8)
2507    4: BRANCH(7)
2508    5:   EXACT <bar>(8)
2509    7: TAIL(8)
2510    8: EXACT <baz>(10)
2511   10: END(0)
2512
2513 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2514 and would end up looking like:
2515
2516     1: TRIE(8)
2517       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2518         <foo>
2519         <bar>
2520    7: TAIL(8)
2521    8: EXACT <baz>(10)
2522   10: END(0)
2523
2524     d = uvchr_to_utf8_flags(d, uv, 0);
2525
2526 is the recommended Unicode-aware way of saying
2527
2528     *(d++) = uv;
2529 */
2530 /* Push onto 'revcharmap' a new SV holding the character 'val' (as UTF-8 when UTF is true) */
2531 #define TRIE_STORE_REVCHAR(val)                                            \
2532     STMT_START {                                                           \
2533         if (UTF) {                                                         \
2534             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2535             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2536             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2537             *kapow = '\0';                                                 \
2538             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2539             SvPOK_on(zlopp);                                               \
2540             SvUTF8_on(zlopp);                                              \
2541             av_push(revcharmap, zlopp);                                    \
2542         } else {                                                           \
2543             char ooooff = (char)val;                                           \
2544             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2545         }                                                                  \
2546         } STMT_END
2547 /* Uses 'uc', 'uvc', 'len', 'wordlen' and 'folder' from the expansion site.
2548  * This gets the next character from the input, folding it if not already
2549  * folded.  It stores the result in 'uvc' and does not advance 'uc'. */
2550 #define TRIE_READ_CHAR STMT_START {                                           \
2551     wordlen++;                                                                \
2552     if ( UTF ) {                                                              \
2553         /* if it is UTF then it is either already folded, or does not need    \
2554          * folding */                                                         \
2555         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2556     }                                                                         \
2557     else if (folder == PL_fold_latin1) {                                      \
2558         /* This folder implies Unicode rules, which in the range expressible  \
2559          *  by not UTF is the lower case, with the two exceptions, one of     \
2560          *  which should have been taken care of before calling this */       \
2561         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2562         uvc = toLOWER_L1(*uc);                                                \
2563         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2564         len = 1;                                                              \
2565     } else {                                                                  \
2566         /* raw data, will be folded later if needed */                        \
2567         uvc = (U32)*uc;                                                       \
2568         len = 1;                                                              \
2569     }                                                                         \
2570 } STMT_END
2571
2572
2573 /* Append the pair (fid, ns) to the transition list of 'state', doubling its allocation when full */
2574 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2575     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2576         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2577         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2578         TRIE_LIST_LEN( state ) = ging;                          \
2579     }                                                           \
2580     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2581     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2582     TRIE_LIST_CUR( state )++;                                   \
2583 } STMT_END
2584 /* Allocate a 4-slot transition list for 'state'; slot 0 holds the counters, so entries begin at 1 */
2585 #define TRIE_LIST_NEW(state) STMT_START {                       \
2586     Newx( trie->states[ state ].trans.list,                     \
2587         4, reg_trie_trans_le );                                 \
2588      TRIE_LIST_CUR( state ) = 1;                                \
2589      TRIE_LIST_LEN( state ) = 4;                                \
2590 } STMT_END
2591 /* Record that the word just read ends at 'state'; relies on 'noper', 'cur', 'tail', 'convert', 'curword', 'wordlen' etc. at the expansion site */
2592 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2593     U16 dupe= trie->states[ state ].wordnum;                    \
2594     regnode * const noper_next = regnext( noper );              \
2595                                                                 \
2596     DEBUG_r({                                                   \
2597         /* store the word for dumping */                        \
2598         SV* tmp;                                                \
2599         if (OP(noper) != NOTHING)                               \
2600             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2601         else                                                    \
2602             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2603         av_push( trie_words, tmp );                             \
2604     });                                                         \
2605     /* Start the wordinfo entry for the new word */             \
2606     curword++;                                                  \
2607     trie->wordinfo[curword].prev   = 0;                         \
2608     trie->wordinfo[curword].len    = wordlen;                   \
2609     trie->wordinfo[curword].accept = state;                     \
2610     /* Nodes follow the word before 'tail': save a jump */      \
2611     if ( noper_next < tail ) {                                  \
2612         if (!trie->jump)                                        \
2613             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2614                                                  sizeof(U16) ); \
2615         trie->jump[curword] = (U16)(noper_next - convert);      \
2616         if (!jumper)                                            \
2617             jumper = noper_next;                                \
2618         if (!nextbranch)                                        \
2619             nextbranch= regnext(cur);                           \
2620     }                                                           \
2621                                                                 \
2622     if ( dupe ) {                                               \
2623         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2624         /* chain, so that when the bits of chain are later    */\
2625         /* linked together, the dups appear in the chain      */\
2626         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2627         trie->wordinfo[dupe].prev = curword;                    \
2628     } else {                                                    \
2629         /* we haven't inserted this word yet.                */ \
2630         trie->states[ state ].wordnum = curword;                \
2631     }                                                           \
2632 } STMT_END
2633 /* Look up the transition out of 'state' on 'charid' in the trans[] table; a miss
2634  * yields 'special' when state is 1, else 0.  Needs 'trie' and 'ubound' in scope. */
2635 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2636      ( ( base + charid >=  ucharcount                                   \
2637          && base + charid < ubound                                      \
2638          && state == trie->trans[ base - ucharcount + charid ].check    \
2639          && trie->trans[ base - ucharcount + charid ].next )            \
2640            ? trie->trans[ base - ucharcount + charid ].next             \
2641            : ( state==1 ? special : 0 )                                 \
2642       )
2643 /* Set the bitmap bit for 'uvc', for its entry in 'folder' (if any) and, when not UTF, for its UTF-8 lead byte */
2644 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2645 STMT_START {                                                \
2646     TRIE_BITMAP_SET(trie, uvc);                             \
2647     /* store the folded codepoint */                        \
2648     if ( folder )                                           \
2649         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2650                                                             \
2651     if ( !UTF ) {                                           \
2652         /* store first byte of utf8 representation of */    \
2653         /* variant codepoints */                            \
2654         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2655             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2656         }                                                   \
2657     }                                                       \
2658 } STMT_END
2659 #define MADE_TRIE       1
2660 #define MADE_JUMP_TRIE  2
2661 #define MADE_EXACT_TRIE 4
2662 /* NOTE(review): the MADE_* values above are presumably make_trie()'s result codes -- confirm */
2663 STATIC I32
2664 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2665                   regnode *first, regnode *last, regnode *tail,
2666                   U32 word_count, U32 flags, U32 depth)
2667 {
2668     /* first pass, loop through and scan words */
2669     reg_trie_data *trie;
2670     HV *widecharmap = NULL;
2671     AV *revcharmap = newAV();
2672     regnode *cur;
2673     STRLEN len = 0;
2674     UV uvc = 0;
2675     U16 curword = 0;
2676     U32 next_alloc = 0;
2677     regnode *jumper = NULL;
2678     regnode *nextbranch = NULL;
2679     regnode *convert = NULL;
2680     U32 *prev_states; /* temp array mapping each state to previous one */
2681     /* we just use folder as a flag in utf8 */
2682     const U8 * folder = NULL;
2683
2684     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2685      * which stands for one trie structure, one hash, optionally followed
2686      * by two arrays */
2687 #ifdef DEBUGGING
2688     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2689     AV *trie_words = NULL;
2690     /* along with revcharmap, this only used during construction but both are
2691      * useful during debugging so we store them in the struct when debugging.
2692      */
2693 #else
2694     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2695     STRLEN trie_charcount=0;
2696 #endif
2697     SV *re_trie_maxbuff;
2698     GET_RE_DEBUG_FLAGS_DECL;
2699
2700     PERL_ARGS_ASSERT_MAKE_TRIE;
2701 #ifndef DEBUGGING
2702     PERL_UNUSED_ARG(depth);
2703 #endif
2704
2705     switch (flags) {
2706         case EXACT: case EXACT_REQ8: case EXACTL: break;
2707         case EXACTFAA:
2708         case EXACTFUP:
2709         case EXACTFU:
2710         case EXACTFLU8: folder = PL_fold_latin1; break;
2711         case EXACTF:  folder = PL_fold; break;
2712         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2713     }
2714
2715     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2716     trie->refcount = 1;
2717     trie->startstate = 1;
2718     trie->wordcount = word_count;
2719     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2720     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2721     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2722         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2723     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2724                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2725
2726     DEBUG_r({
2727         trie_words = newAV();
2728     });
2729
2730     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2731     assert(re_trie_maxbuff);
2732     if (!SvIOK(re_trie_maxbuff)) {
2733         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2734     }
2735     DEBUG_TRIE_COMPILE_r({
2736         Perl_re_indentf( aTHX_
2737           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2738           depth+1,
2739           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2740           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2741     });
2742
2743    /* Find the node we are going to overwrite */
2744     if ( first == startbranch && OP( last ) != BRANCH ) {
2745         /* whole branch chain */
2746         convert = first;
2747     } else {
2748         /* branch sub-chain */
2749         convert = NEXTOPER( first );
2750     }
2751
2752     /*  -- First loop and Setup --
2753
2754        We first traverse the branches and scan each word to determine if it
2755        contains widechars, and how many unique chars there are, this is
2756        important as we have to build a table with at least as many columns as we
2757        have unique chars.
2758
2759        We use an array of integers to represent the character codes 0..255
2760        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2761        the native representation of the character value as the key and IV's for
2762        the coded index.
2763
2764        *TODO* If we keep track of how many times each character is used we can
2765        remap the columns so that the table compression later on is more
2766        efficient in terms of memory by ensuring the most common value is in the
2767        middle and the least common are on the outside.  IMO this would be better
2768        than a most-to-least-common mapping as there's a decent chance the most
2769        common letter will share a node with the least common, meaning the node
2770        will not be compressible. With a most-common-in-the-middle approach the
2771        worst case is when we have the least common nodes twice.
2772
2773      */
2774
2775     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2776         regnode *noper = NEXTOPER( cur );
2777         const U8 *uc;
2778         const U8 *e;
2779         int foldlen = 0;
2780         U32 wordlen      = 0;         /* required init */
2781         STRLEN minchars = 0;
2782         STRLEN maxchars = 0;
2783         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2784                                                bitmap?*/
2785
2786         if (OP(noper) == NOTHING) {
2787             /* skip past a NOTHING at the start of an alternation
2788              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2789              */
2790             regnode *noper_next= regnext(noper);
2791             if (noper_next < tail)
2792                 noper= noper_next;
2793         }
2794
2795         if (    noper < tail
2796             && (    OP(noper) == flags
2797                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2798                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2799                                          || OP(noper) == EXACTFUP))))
2800         {
2801             uc= (U8*)STRING(noper);
2802             e= uc + STR_LEN(noper);
2803         } else {
2804             trie->minlen= 0;
2805             continue;
2806         }
2807
2808
2809         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2810             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2811                                           regardless of encoding */
2812             if (OP( noper ) == EXACTFUP) {
2813                 /* false positives are ok, so just set this */
2814                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2815             }
2816         }
2817
2818         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2819                                            branch */
2820             TRIE_CHARCOUNT(trie)++;
2821             TRIE_READ_CHAR;
2822
2823             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2824              * is in effect.  Under /i, this character can match itself, or
2825              * anything that folds to it.  If not under /i, it can match just
2826              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2827              * all fold to k, and all are single characters.   But some folds
2828              * expand to more than one character, so for example LATIN SMALL
2829              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2830              * the string beginning at 'uc' is 'ffi', it could be matched by
2831              * three characters, or just by the one ligature character. (It
2832              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2833              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2834              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2835              * match.)  The trie needs to know the minimum and maximum number
2836              * of characters that could match so that it can use size alone to
2837              * quickly reject many match attempts.  The max is simple: it is
2838              * the number of folded characters in this branch (since a fold is
2839              * never shorter than what folds to it). */
2840
2841             maxchars++;
2842
2843             /* And the min is equal to the max if not under /i (indicated by
2844              * 'folder' being NULL), or there are no multi-character folds.  If
2845              * there is a multi-character fold, the min is incremented just
2846              * once, for the character that folds to the sequence.  Each
2847              * character in the sequence needs to be added to the list below of
2848              * characters in the trie, but we count only the first towards the
2849              * min number of characters needed.  This is done through the
2850              * variable 'foldlen', which is returned by the macros that look
2851              * for these sequences as the number of bytes the sequence
2852              * occupies.  Each time through the loop, we decrement 'foldlen' by
2853              * how many bytes the current char occupies.  Only when it reaches
2854              * 0 do we increment 'minchars' or look for another multi-character
2855              * sequence. */
2856             if (folder == NULL) {
2857                 minchars++;
2858             }
2859             else if (foldlen > 0) {
2860                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2861             }
2862             else {
2863                 minchars++;
2864
2865                 /* See if *uc is the beginning of a multi-character fold.  If
2866                  * so, we decrement the length remaining to look at, to account
2867                  * for the current character this iteration.  (We can use 'uc'
2868                  * instead of the fold returned by TRIE_READ_CHAR because for
2869                  * non-UTF, the latin1_safe macro is smart enough to account
2870                  * for all the unfolded characters, and because for UTF, the
2871                  * string will already have been folded earlier in the
2872                  * compilation process */
2873                 if (UTF) {
2874                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2875                         foldlen -= UTF8SKIP(uc);
2876                     }
2877                 }
2878                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2879                     foldlen--;
2880                 }
2881             }
2882
2883             /* The current character (and any potential folds) should be added
2884              * to the possible matching characters for this position in this
2885              * branch */
2886             if ( uvc < 256 ) {
2887                 if ( folder ) {
2888                     U8 folded= folder[ (U8) uvc ];
2889                     if ( !trie->charmap[ folded ] ) {
2890                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2891                         TRIE_STORE_REVCHAR( folded );
2892                     }
2893                 }
2894                 if ( !trie->charmap[ uvc ] ) {
2895                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2896                     TRIE_STORE_REVCHAR( uvc );
2897                 }
2898                 if ( set_bit ) {
2899                     /* store the codepoint in the bitmap, and its folded
2900                      * equivalent. */
2901                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2902                     set_bit = 0; /* We've done our bit :-) */
2903                 }
2904             } else {
2905
2906                 /* XXX We could come up with the list of code points that fold
2907                  * to this using PL_utf8_foldclosures, except not for
2908                  * multi-char folds, as there may be multiple combinations
2909                  * there that could work, which needs to wait until runtime to
2910                  * resolve (the comment about LIGATURE FFI above is such an
2911                  * example). */
2912
2913                 SV** svpp;
2914                 if ( !widecharmap )
2915                     widecharmap = newHV();
2916
2917                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2918
2919                 if ( !svpp )
2920                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2921
2922                 if ( !SvTRUE( *svpp ) ) {
2923                     sv_setiv( *svpp, ++trie->uniquecharcount );
2924                     TRIE_STORE_REVCHAR(uvc);
2925                 }
2926             }
2927         } /* end loop through characters in this branch of the trie */
2928
2929         /* We take the min and max for this branch and combine to find the min
2930          * and max for all branches processed so far */
2931         if( cur == first ) {
2932             trie->minlen = minchars;
2933             trie->maxlen = maxchars;
2934         } else if (minchars < trie->minlen) {
2935             trie->minlen = minchars;
2936         } else if (maxchars > trie->maxlen) {
2937             trie->maxlen = maxchars;
2938         }
2939     } /* end first pass */
2940     DEBUG_TRIE_COMPILE_r(
2941         Perl_re_indentf( aTHX_
2942                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2943                 depth+1,
2944                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2945                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2946                 (int)trie->minlen, (int)trie->maxlen )
2947     );
2948
2949     /*
2950         We now know what we are dealing with in terms of unique chars and
2951         string sizes so we can calculate how much memory a naive
2952         representation using a flat table  will take. If it's over a reasonable
2953         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2954         conservative but potentially much slower representation using an array
2955         of lists.
2956
2957         At the end we convert both representations into the same compressed
2958         form that will be used in regexec.c for matching. The compressed
2959         form cannot be used for construction, but it has memory
2960         properties similar to the list form and access properties similar
2961         to the table form, making it both suitable for fast searches and
2962         small enough that it's feasible to store for the duration of a program.
2963
2964         See the comment in the code where the compressed table is produced
2965         in place from the flat table representation for an explanation of how
2966         the compression works.
2967
2968     */
2969
2970
2971     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2972     prev_states[1] = 0;
2973
2974     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2975                                                     > SvIV(re_trie_maxbuff) )
2976     {
2977         /*
2978             Second Pass -- Array Of Lists Representation
2979
2980             Each state will be represented by a list of charid:state records
2981             (reg_trie_trans_le) the first such element holds the CUR and LEN
2982             points of the allocated array. (See defines above).
2983
2984             We build the initial structure using the lists, and then convert
2985             it into the compressed table form, which allows faster lookups
2986             (but can't be modified once converted).
2987         */
2988
2989         STRLEN transcount = 1;
2990
2991         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2992             depth+1));
2993
2994         trie->states = (reg_trie_state *)
2995             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2996                                   sizeof(reg_trie_state) );
2997         TRIE_LIST_NEW(1);
2998         next_alloc = 2;
2999
3000         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3001
3002             regnode *noper   = NEXTOPER( cur );
3003             U32 state        = 1;         /* required init */
3004             U16 charid       = 0;         /* sanity init */
3005             U32 wordlen      = 0;         /* required init */
3006
3007             if (OP(noper) == NOTHING) {
3008                 regnode *noper_next= regnext(noper);
3009                 if (noper_next < tail)
3010                     noper= noper_next;
3011             }
3012
3013             if (    noper < tail
3014                 && (    OP(noper) == flags
3015                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3016                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3017                                              || OP(noper) == EXACTFUP))))
3018             {
3019                 const U8 *uc= (U8*)STRING(noper);
3020                 const U8 *e= uc + STR_LEN(noper);
3021
3022                 for ( ; uc < e ; uc += len ) {
3023
3024                     TRIE_READ_CHAR;
3025
3026                     if ( uvc < 256 ) {
3027                         charid = trie->charmap[ uvc ];
3028                     } else {
3029                         SV** const svpp = hv_fetch( widecharmap,
3030                                                     (char*)&uvc,
3031                                                     sizeof( UV ),
3032                                                     0);
3033                         if ( !svpp ) {
3034                             charid = 0;
3035                         } else {
3036                             charid=(U16)SvIV( *svpp );
3037                         }
3038                     }
3039                     /* charid is now 0 if we don't know the char read, or
3040                      * nonzero if we do */
3041                     if ( charid ) {
3042
3043                         U16 check;
3044                         U32 newstate = 0;
3045
3046                         charid--;
3047                         if ( !trie->states[ state ].trans.list ) {
3048                             TRIE_LIST_NEW( state );
3049                         }
3050                         for ( check = 1;
3051                               check <= TRIE_LIST_USED( state );
3052                               check++ )
3053                         {
3054                             if ( TRIE_LIST_ITEM( state, check ).forid
3055                                                                     == charid )
3056                             {
3057                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3058                                 break;
3059                             }
3060                         }
3061                         if ( ! newstate ) {
3062                             newstate = next_alloc++;
3063                             prev_states[newstate] = state;
3064                             TRIE_LIST_PUSH( state, charid, newstate );
3065                             transcount++;
3066                         }
3067                         state = newstate;
3068                     } else {
3069                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3070                     }
3071                 }
3072             }
3073             TRIE_HANDLE_WORD(state);
3074
3075         } /* end second pass */
3076
3077         /* next alloc is the NEXT state to be allocated */
3078         trie->statecount = next_alloc;
3079         trie->states = (reg_trie_state *)
3080             PerlMemShared_realloc( trie->states,
3081                                    next_alloc
3082                                    * sizeof(reg_trie_state) );
3083
3084         /* and now dump it out before we compress it */
3085         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3086                                                          revcharmap, next_alloc,
3087                                                          depth+1)
3088         );
3089
3090         trie->trans = (reg_trie_trans *)
3091             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3092         {
3093             U32 state;
3094             U32 tp = 0;
3095             U32 zp = 0;
3096
3097
3098             for( state=1 ; state < next_alloc ; state ++ ) {
3099                 U32 base=0;
3100
3101                 /*
3102                 DEBUG_TRIE_COMPILE_MORE_r(
3103                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3104                 );
3105                 */
3106
3107                 if (trie->states[state].trans.list) {
3108                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3109                     U16 maxid=minid;
3110                     U16 idx;
3111
3112                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3113                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3114                         if ( forid < minid ) {
3115                             minid=forid;
3116                         } else if ( forid > maxid ) {
3117                             maxid=forid;
3118                         }
3119                     }
3120                     if ( transcount < tp + maxid - minid + 1) {
3121                         transcount *= 2;
3122                         trie->trans = (reg_trie_trans *)
3123                             PerlMemShared_realloc( trie->trans,
3124                                                      transcount
3125                                                      * sizeof(reg_trie_trans) );
3126                         Zero( trie->trans + (transcount / 2),
3127                               transcount / 2,
3128                               reg_trie_trans );
3129                     }
3130                     base = trie->uniquecharcount + tp - minid;
3131                     if ( maxid == minid ) {
3132                         U32 set = 0;
3133                         for ( ; zp < tp ; zp++ ) {
3134                             if ( ! trie->trans[ zp ].next ) {
3135                                 base = trie->uniquecharcount + zp - minid;
3136                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3137                                                                    1).newstate;
3138                                 trie->trans[ zp ].check = state;
3139                                 set = 1;
3140                                 break;
3141                             }
3142                         }
3143                         if ( !set ) {
3144                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3145                                                                    1).newstate;
3146                             trie->trans[ tp ].check = state;
3147                             tp++;
3148                             zp = tp;
3149                         }
3150                     } else {
3151                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3152                             const U32 tid = base
3153                                            - trie->uniquecharcount
3154                                            + TRIE_LIST_ITEM( state, idx ).forid;
3155                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3156                                                                 idx ).newstate;
3157                             trie->trans[ tid ].check = state;
3158                         }
3159                         tp += ( maxid - minid + 1 );
3160                     }
3161                     Safefree(trie->states[ state ].trans.list);
3162                 }
3163                 /*
3164                 DEBUG_TRIE_COMPILE_MORE_r(
3165                     Perl_re_printf( aTHX_  " base: %d\n",base);
3166                 );
3167                 */
3168                 trie->states[ state ].trans.base=base;
3169             }
3170             trie->lasttrans = tp + 1;
3171         }
3172     } else {
3173         /*
3174            Second Pass -- Flat Table Representation.
3175
3176            We don't use the 0 slot of either trans[] or states[] so we add 1 to
3177            each.  We know that we will need Charcount+1 trans at most to store
3178            the data (one row per char at worst case), so we preallocate both
3179            structures assuming the worst case.
3180
3181            We then construct the trie using only the .next slots of the entry
3182            structs.
3183
3184            We use the .check field of the first entry of the node temporarily
3185            to make compression both faster and easier by keeping track of how
3186            many non zero fields are in the node.
3187
3188            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3189            transition.
3190
3191            There are two terms in use here: state as a TRIE_NODEIDX() which is
3192            a number representing the first entry of the node, and state as a
3193            TRIE_NODENUM() which is the trans number. State 1 is TRIE_NODEIDX(1)
3194            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3195            if there are 2 entries per node. e.g.:
3196
3197              A B       A B
3198           1. 2 4    1. 3 7
3199           2. 0 3    3. 0 5
3200           3. 0 0    5. 0 0
3201           4. 0 0    7. 0 0
3202
3203            The table is internally in the right hand, idx form. However as we
3204            also have to deal with the states array which is indexed by nodenum
3205            we have to use TRIE_NODENUM() to convert.
3206
3207         */
3208         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3209             depth+1));
3210
3211         trie->trans = (reg_trie_trans *)
3212             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3213                                   * trie->uniquecharcount + 1,
3214                                   sizeof(reg_trie_trans) );
3215         trie->states = (reg_trie_state *)
3216             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3217                                   sizeof(reg_trie_state) );
3218         next_alloc = trie->uniquecharcount + 1;
3219
3220
3221         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3222
3223             regnode *noper   = NEXTOPER( cur );
3224
3225             U32 state        = 1;         /* required init */
3226
3227             U16 charid       = 0;         /* sanity init */
3228             U32 accept_state = 0;         /* sanity init */
3229
3230             U32 wordlen      = 0;         /* required init */
3231
3232             if (OP(noper) == NOTHING) {
3233                 regnode *noper_next= regnext(noper);
3234                 if (noper_next < tail)
3235                     noper= noper_next;
3236             }
3237
3238             if (    noper < tail
3239                 && (    OP(noper) == flags
3240                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3241                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3242                                              || OP(noper) == EXACTFUP))))
3243             {
3244                 const U8 *uc= (U8*)STRING(noper);
3245                 const U8 *e= uc + STR_LEN(noper);
3246
3247                 for ( ; uc < e ; uc += len ) {
3248
3249                     TRIE_READ_CHAR;
3250
3251                     if ( uvc < 256 ) {
3252                         charid = trie->charmap[ uvc ];
3253                     } else {
3254                         SV* const * const svpp = hv_fetch( widecharmap,
3255                                                            (char*)&uvc,
3256                                                            sizeof( UV ),
3257                                                            0);
3258                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3259                     }
3260                     if ( charid ) {
3261                         charid--;
3262                         if ( !trie->trans[ state + charid ].next ) {
3263                             trie->trans[ state + charid ].next = next_alloc;
3264                             trie->trans[ state ].check++;
3265                             prev_states[TRIE_NODENUM(next_alloc)]
3266                                     = TRIE_NODENUM(state);
3267                             next_alloc += trie->uniquecharcount;
3268                         }
3269                         state = trie->trans[ state + charid ].next;
3270                     } else {
3271                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3272                     }
3273                     /* charid is now 0 if we dont know the char read, or
3274                      * nonzero if we do */
3275                 }
3276             }
3277             accept_state = TRIE_NODENUM( state );
3278             TRIE_HANDLE_WORD(accept_state);
3279
3280         } /* end second pass */
3281
3282         /* and now dump it out before we compress it */
3283         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3284                                                           revcharmap,
3285                                                           next_alloc, depth+1));
3286
3287         {
3288         /*
3289            * Inplace compress the table.*
3290
3291            For sparse data sets the table constructed by the trie algorithm will
3292            be mostly 0/FAIL transitions or to put it another way mostly empty.
3293            (Note that leaf nodes will not contain any transitions.)
3294
3295            This algorithm compresses the tables by eliminating most such
3296            transitions, at the cost of a modest bit of extra work during lookup:
3297
3298            - Each states[] entry contains a .base field which indicates the
3299            index in the trans[] array where its transition data is stored.
3300
3301            - If .base is 0 there are no valid transitions from that node.
3302
3303            - If .base is nonzero then charid is added to it to find an entry in
3304            the trans array.
3305
3306            -If trans[states[state].base+charid].check!=state then the
3307            transition is taken to be a 0/Fail transition. Thus if there are fail
3308            transitions at the front of the node then the .base offset will point
3309            somewhere inside the previous nodes data (or maybe even into a node
3310            even earlier), but the .check field determines if the transition is
3311            valid.
3312
3313            XXX - wrong maybe?
3314            The following process inplace converts the table to the compressed
3315            table: We first do not compress the root node 1,and mark all its
3316            .check pointers as 1 and set its .base pointer as 1 as well. This
3317            allows us to do a DFA construction from the compressed table later,
3318            and ensures that any .base pointers we calculate later are greater
3319            than 0.
3320
3321            - We set 'pos' to indicate the first entry of the second node.
3322
3323            - We then iterate over the columns of the node, finding the first and
3324            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3325            and set the .check pointers accordingly, and advance pos
3326            appropriately and repeat for the next node. Note that when we copy
3327            the next pointers we have to convert them from the original
3328            NODEIDX form to NODENUM form as the former is not valid post
3329            compression.
3330
3331            - If a node has no transitions used we mark its base as 0 and do not
3332            advance the pos pointer.
3333
3334            - If a node only has one transition we use a second pointer into the
3335            structure to fill in allocated fail transitions from other states.
3336            This pointer is independent of the main pointer and scans forward
3337            looking for null transitions that are allocated to a state. When it
3338            finds one it writes the single transition into the "hole".  If the
3339            pointer doesn't find one the single transition is appended as normal.
3340
3341            - Once compressed we can Renew/realloc the structures to release the
3342            excess space.
3343
3344            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3345            specifically Fig 3.47 and the associated pseudocode.
3346
3347            demq
3348         */
3349         const U32 laststate = TRIE_NODENUM( next_alloc );
3350         U32 state, charid;
3351         U32 pos = 0, zp=0;
3352         trie->statecount = laststate;
3353
3354         for ( state = 1 ; state < laststate ; state++ ) {
3355             U8 flag = 0;
3356             const U32 stateidx = TRIE_NODEIDX( state );
3357             const U32 o_used = trie->trans[ stateidx ].check;
3358             U32 used = trie->trans[ stateidx ].check;
3359             trie->trans[ stateidx ].check = 0;
3360
3361             for ( charid = 0;
3362                   used && charid < trie->uniquecharcount;
3363                   charid++ )
3364             {
3365                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3366                     if ( trie->trans[ stateidx + charid ].next ) {
3367                         if (o_used == 1) {
3368                             for ( ; zp < pos ; zp++ ) {
3369                                 if ( ! trie->trans[ zp ].next ) {
3370                                     break;
3371                                 }
3372                             }
3373                             trie->states[ state ].trans.base
3374                                                     = zp
3375                                                       + trie->uniquecharcount
3376                                                       - charid ;
3377                             trie->trans[ zp ].next
3378                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3379                                                              + charid ].next );
3380                             trie->trans[ zp ].check = state;
3381                             if ( ++zp > pos ) pos = zp;
3382                             break;
3383                         }
3384                         used--;
3385                     }
3386                     if ( !flag ) {
3387                         flag = 1;
3388                         trie->states[ state ].trans.base
3389                                        = pos + trie->uniquecharcount - charid ;
3390                     }
3391                     trie->trans[ pos ].next
3392                         = SAFE_TRIE_NODENUM(
3393                                        trie->trans[ stateidx + charid ].next );
3394                     trie->trans[ pos ].check = state;
3395                     pos++;
3396                 }
3397             }
3398         }
3399         trie->lasttrans = pos + 1;
3400         trie->states = (reg_trie_state *)
3401             PerlMemShared_realloc( trie->states, laststate
3402                                    * sizeof(reg_trie_state) );
3403         DEBUG_TRIE_COMPILE_MORE_r(
3404             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3405                 depth+1,
3406                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3407                        + 1 ),
3408                 (IV)next_alloc,
3409                 (IV)pos,
3410                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3411             );
3412
3413         } /* end table compress */
3414     }
3415     DEBUG_TRIE_COMPILE_MORE_r(
3416             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3417                 depth+1,
3418                 (UV)trie->statecount,
3419                 (UV)trie->lasttrans)
3420     );
3421     /* resize the trans array to remove unused space */
3422     trie->trans = (reg_trie_trans *)
3423         PerlMemShared_realloc( trie->trans, trie->lasttrans
3424                                * sizeof(reg_trie_trans) );
3425
3426     {   /* Modify the program and insert the new TRIE node */
3427         U8 nodetype =(U8)(flags & 0xFF);
3428         char *str=NULL;
3429
3430 #ifdef DEBUGGING
3431         regnode *optimize = NULL;
3432 #ifdef RE_TRACK_PATTERN_OFFSETS
3433
3434         U32 mjd_offset = 0;
3435         U32 mjd_nodelen = 0;
3436 #endif /* RE_TRACK_PATTERN_OFFSETS */
3437 #endif /* DEBUGGING */
3438         /*
3439            This means we convert either the first branch or the first Exact,
3440            depending on whether the thing following (in 'last') is a branch
3441            or not and whether first is the startbranch (i.e. is it a sub part of
3442            the alternation or is it the whole thing.)
3443            Assuming it's a sub part we convert the EXACT, otherwise we convert
3444            the whole branch sequence, including the first.
3445          */
3446         /* Find the node we are going to overwrite */
3447         if ( first != startbranch || OP( last ) == BRANCH ) {
3448             /* branch sub-chain */
3449             NEXT_OFF( first ) = (U16)(last - first);
3450 #ifdef RE_TRACK_PATTERN_OFFSETS
3451             DEBUG_r({
3452                 mjd_offset= Node_Offset((convert));
3453                 mjd_nodelen= Node_Length((convert));
3454             });
3455 #endif
3456             /* whole branch chain */
3457         }
3458 #ifdef RE_TRACK_PATTERN_OFFSETS
3459         else {
3460             DEBUG_r({
3461                 const  regnode *nop = NEXTOPER( convert );
3462                 mjd_offset= Node_Offset((nop));
3463                 mjd_nodelen= Node_Length((nop));
3464             });
3465         }
3466         DEBUG_OPTIMISE_r(
3467             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3468                 depth+1,
3469                 (UV)mjd_offset, (UV)mjd_nodelen)
3470         );
3471 #endif
3472         /* But first we check to see if there is a common prefix we can
3473            split out as an EXACT and put in front of the TRIE node.  */
3474         trie->startstate= 1;
3475         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3476             /* we want to find the first state that has more than
3477              * one transition, if that state is not the first state
3478              * then we have a common prefix which we can remove.
3479              */
3480             U32 state;
3481             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3482                 U32 ofs = 0;
3483                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3484                                        transition, -1 means none */
3485                 U32 count = 0;
3486                 const U32 base = trie->states[ state ].trans.base;
3487
3488                 /* does this state terminate an alternation? */
3489                 if ( trie->states[state].wordnum )
3490                         count = 1;
3491
3492                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3493                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3494                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3495                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3496                     {
3497                         if ( ++count > 1 ) {
3498                             /* we have more than one transition */
3499                             SV **tmp;
3500                             U8 *ch;
3501                             /* if this is the first state there is no common prefix
3502                              * to extract, so we can exit */
3503                             if ( state == 1 ) break;
3504                             tmp = av_fetch( revcharmap, ofs, 0);
3505                             ch = (U8*)SvPV_nolen_const( *tmp );
3506
3507                             /* if we are on count 2 then we need to initialize the
3508                              * bitmap, and store the previous char if there was one
3509                              * in it*/
3510                             if ( count == 2 ) {
3511                                 /* clear the bitmap */
3512                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3513                                 DEBUG_OPTIMISE_r(
3514                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3515                                         depth+1,
3516                                         (UV)state));
3517                                 if (first_ofs >= 0) {
3518                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3519                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3520
3521                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3522                                     DEBUG_OPTIMISE_r(
3523                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3524                                     );
3525                                 }
3526                             }
3527                             /* store the current firstchar in the bitmap */
3528                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3529                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3530                         }
3531                         first_ofs = ofs;
3532                     }
3533                 }
3534                 if ( count == 1 ) {
3535                     /* This state has only one transition, its transition is part
3536                      * of a common prefix - we need to concatenate the char it
3537                      * represents to what we have so far. */
3538                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3539                     STRLEN len;
3540                     char *ch = SvPV( *tmp, len );
3541                     DEBUG_OPTIMISE_r({
3542                         SV *sv=sv_newmortal();
3543                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3544                             depth+1,
3545                             (UV)state, (UV)first_ofs,
3546                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3547                                 PL_colors[0], PL_colors[1],
3548                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3549                                 PERL_PV_ESCAPE_FIRSTCHAR
3550                             )
3551                         );
3552                     });
3553                     if ( state==1 ) {
3554                         OP( convert ) = nodetype;
3555                         str=STRING(convert);
3556                         setSTR_LEN(convert, 0);
3557                     }
3558                     setSTR_LEN(convert, STR_LEN(convert) + len);
3559                     while (len--)
3560                         *str++ = *ch++;
3561                 } else {
3562 #ifdef DEBUGGING
3563                     if (state>1)
3564                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3565 #endif
3566                     break;
3567                 }
3568             }
3569             trie->prefixlen = (state-1);
3570             if (str) {
3571                 regnode *n = convert+NODE_SZ_STR(convert);
3572                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3573                 trie->startstate = state;
3574                 trie->minlen -= (state - 1);
3575                 trie->maxlen -= (state - 1);
3576 #ifdef DEBUGGING
3577                /* At least the UNICOS C compiler choked on this
3578                 * being argument to DEBUG_r(), so let's just have
3579                 * it right here. */
3580                if (
3581 #ifdef PERL_EXT_RE_BUILD
3582                    1
3583 #else
3584                    DEBUG_r_TEST
3585 #endif
3586                    ) {
3587                    regnode *fix = convert;
3588                    U32 word = trie->wordcount;
3589 #ifdef RE_TRACK_PATTERN_OFFSETS
3590                    mjd_nodelen++;
3591 #endif
3592                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3593                    while( ++fix < n ) {
3594                        Set_Node_Offset_Length(fix, 0, 0);
3595                    }
3596                    while (word--) {
3597                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3598                        if (tmp) {
3599                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3600                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3601                            else
3602                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3603                        }
3604                    }
3605                }
3606 #endif
3607                 if (trie->maxlen) {
3608                     convert = n;
3609                 } else {
3610                     NEXT_OFF(convert) = (U16)(tail - convert);
3611                     DEBUG_r(optimize= n);
3612                 }
3613             }
3614         }
3615         if (!jumper)
3616             jumper = last;
3617         if ( trie->maxlen ) {
3618             NEXT_OFF( convert ) = (U16)(tail - convert);
3619             ARG_SET( convert, data_slot );
3620             /* Store the offset to the first unabsorbed branch in
3621                jump[0], which is otherwise unused by the jump logic.
3622                We use this when dumping a trie and during optimisation. */
3623             if (trie->jump)
3624                 trie->jump[0] = (U16)(nextbranch - convert);
3625
3626             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3627              *   and there is a bitmap
3628              *   and the first "jump target" node we found leaves enough room
3629              * then convert the TRIE node into a TRIEC node, with the bitmap
3630              * embedded inline in the opcode - this is hypothetically faster.
3631              */
3632             if ( !trie->states[trie->startstate].wordnum
3633                  && trie->bitmap
3634                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3635             {
3636                 OP( convert ) = TRIEC;
3637                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3638                 PerlMemShared_free(trie->bitmap);
3639                 trie->bitmap= NULL;
3640             } else
3641                 OP( convert ) = TRIE;
3642
3643             /* store the type in the flags */
3644             convert->flags = nodetype;
3645             DEBUG_r({
3646             optimize = convert
3647                       + NODE_STEP_REGNODE
3648                       + regarglen[ OP( convert ) ];
3649             });
3650             /* XXX We really should free up the resource in trie now,
3651                    as we won't use them - (which resources?) dmq */
3652         }
3653         /* needed for dumping*/
3654         DEBUG_r(if (optimize) {
3655             regnode *opt = convert;
3656
3657             while ( ++opt < optimize) {
3658                 Set_Node_Offset_Length(opt, 0, 0);
3659             }
3660             /*
3661                 Try to clean up some of the debris left after the
3662                 optimisation.
3663              */
3664             while( optimize < jumper ) {
3665                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3666                 OP( optimize ) = OPTIMIZED;
3667                 Set_Node_Offset_Length(optimize, 0, 0);
3668                 optimize++;
3669             }
3670             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3671         });
3672     } /* end node insert */
3673
3674     /*  Finish populating the prev field of the wordinfo array.  Walk back
3675      *  from each accept state until we find another accept state, and if
3676      *  so, point the first word's .prev field at the second word. If the
3677      *  second already has a .prev field set, stop now. This will be the
3678      *  case either if we've already processed that word's accept state,
3679      *  or that state had multiple words, and the overspill words were
3680      *  already linked up earlier.
3681      */
3682     {
3683         U16 word;
3684         U32 state;
3685         U16 prev;
3686
3687         for (word=1; word <= trie->wordcount; word++) {
3688             prev = 0;
3689             if (trie->wordinfo[word].prev)
3690                 continue;
3691             state = trie->wordinfo[word].accept;
3692             while (state) {
3693                 state = prev_states[state];
3694                 if (!state)
3695                     break;
3696                 prev = trie->states[state].wordnum;
3697                 if (prev)
3698                     break;
3699             }
3700             trie->wordinfo[word].prev = prev;
3701         }
3702         Safefree(prev_states);
3703     }
3704
3705
3706     /* and now dump out the compressed format */
3707     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3708
3709     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3710 #ifdef DEBUGGING
3711     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3712     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3713 #else
3714     SvREFCNT_dec_NN(revcharmap);
3715 #endif
3716     return trie->jump
3717            ? MADE_JUMP_TRIE
3718            : trie->startstate>1
3719              ? MADE_EXACT_TRIE
3720              : MADE_TRIE;
3721 }
3722
3723 STATIC regnode *
3724 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3725 {
3726 /* The Trie is constructed and compressed now so we can build a fail array if
3727  * it's needed
3728
3729    This is basically the Aho-Corasick algorithm. It's from exercise 3.31 and
3730    3.32 in the
3731    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3732    Ullman 1985/88
3733    ISBN 0-201-10088-6
3734
3735    We find the fail state for each state in the trie, this state is the longest
3736    proper suffix of the current state's 'word' that is also a proper prefix of
3737    another word in our trie. State 1 represents the word '' and is thus the
3738    default fail state. This allows the DFA not to have to restart after it's
3739    tried and failed a word at a given point, it simply continues as though it
3740    had been matching the other word in the first place.
3741    Consider
3742       'abcdgu'=~/abcdefg|cdgu/
3743    When we get to 'd' we are still matching the first word, we would encounter
3744    'g' which would fail, which would bring us to the state representing 'd' in
3745    the second word where we would try 'g' and succeed, proceeding to match
3746    'cdgu'.

        Arguments:
          pRExC_state  compile state; handed to add_data() to reserve the data
                       slot that will hold the new reg_ac_data
          source       trie node; ARG(source) indexes the reg_trie_data in
                       RExC_rxi->data->data.  It is only read here: a copy is
                       made and the copy is what gets converted
          depth        used only to indent the debugging output

        Returns a regnode allocated with PerlMemShared_calloc() that is a copy
        of 'source' with its opcode advanced by 2 and its ARG set to the slot
        of the new reg_ac_data.  NOTE(review): nothing in this function frees
        it, so the caller presumably takes ownership -- confirm.
3747  */
3748  /* add a fail transition */
3749     const U32 trie_offset = ARG(source); /* data slot of the existing trie */
3750     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3751     U32 *q; /* FIFO work queue of state numbers still to be processed */
3752     const U32 ucharcount = trie->uniquecharcount;
3753     const U32 numstates = trie->statecount;
3754     const U32 ubound = trie->lasttrans + ucharcount; /* not referenced by name below; presumably consumed inside TRIE_TRANS_STATE -- TODO confirm */
3755     U32 q_read = 0;  /* count of queue entries consumed so far */
3756     U32 q_write = 0; /* count of queue entries produced so far */
3757     U32 charid;
3758     U32 base = trie->states[ 1 ].trans.base; /* starts at the root, state 1 */
3759     U32 *fail; /* alias for aho->fail, indexed by state number */
3760     reg_ac_data *aho;
3761     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3762     regnode *stclass; /* the converted copy of 'source'; this is the return value */
3763     GET_RE_DEBUG_FLAGS_DECL;
3764
3765     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3766     PERL_UNUSED_CONTEXT;
3767 #ifndef DEBUGGING
3768     PERL_UNUSED_ARG(depth);
3769 #endif
3770
         /* Duplicate 'source' at the size that matches its opcode: a TRIE is
          * copied as a struct regnode_1, anything else as a struct
          * regnode_charclass (presumably the TRIEC form -- confirm). */
3771     if ( OP(source) == TRIE ) {
3772         struct regnode_1 *op = (struct regnode_1 *)
3773             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3774         StructCopy(source, op, struct regnode_1);
3775         stclass = (regnode *)op;
3776     } else {
3777         struct regnode_charclass *op = (struct regnode_charclass *)
3778             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3779         StructCopy(source, op, struct regnode_charclass);
3780         stclass = (regnode *)op;
3781     }
3782     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3783
         /* Point the copy at a fresh reg_ac_data that records the original
          * trie's slot, a private copy of its state table, and the fail table. */
3784     ARG_SET( stclass, data_slot );
3785     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3786     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3787     aho->trie=trie_offset;
3788     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3789     Copy( trie->states, aho->states, numstates, reg_trie_state ); /* wordnum is rewritten in this copy below; trie->states is left untouched */
3790     Newx( q, numstates, U32); /* scratch only: released by Safefree(q) before returning */
3791     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3792     aho->refcount = 1;
3793     fail = aho->fail;
3794     /* initialize fail[0..1] to be 1 so that we always have
3795        a valid final fail state */
3796     fail[ 0 ] = fail[ 1 ] = 1;
3797
         /* Seed the queue with every state reachable from state 1 on a single
          * character; each of those fails straight back to state 1. */
3798     for ( charid = 0; charid < ucharcount ; charid++ ) {
3799         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3800         if ( newstate ) {
3801             q[ q_write ] = newstate;
3802             /* set to point at the root */
3803             fail[ q[ q_write++ ] ]=1;
3804         }
3805     }
         /* Drain the queue in FIFO order.  From here on q is indexed modulo
          * numstates, i.e. it is treated as a ring buffer. */
3806     while ( q_read < q_write) {
3807         const U32 cur = q[ q_read++ % numstates ];
3808         base = trie->states[ cur ].trans.base;
3809
3810         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3811             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3812             if (ch_state) {
3813                 U32 fail_state = cur;
3814                 U32 fail_base;
                     /* Walk cur's fail chain until a state is found for which
                      * TRIE_TRANS_STATE yields non-zero on charid.
                      * NOTE(review): termination presumably relies on
                      * fail[0..1] == 1 together with the final macro argument
                      * of 1 -- confirm against the macro definition. */
3815                 do {
3816                     fail_state = fail[ fail_state ];
3817                     fail_base = aho->states[ fail_state ].trans.base;
3818                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3819
3820                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3821                 fail[ ch_state ] = fail_state;
                     /* A state with no word of its own inherits the word
                      * number of its fail state, if that one has a word. */
3822                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3823                 {
3824                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3825                 }
3826                 q[ q_write++ % numstates] = ch_state;
3827             }
3828         }
3829     }
3830     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3831        when we fail in state 1, this allows us to use the
3832        charclass scan to find a valid start char. This is based on the principle
3833        that there's a good chance the string being searched contains lots of stuff
3834        that can't be a start char.
3835      */
3836     fail[ 0 ] = fail[ 1 ] = 0;
3837     DEBUG_TRIE_COMPILE_r({
3838         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3839                       depth, (UV)numstates
3840         );
3841         for( q_read=1; q_read<numstates; q_read++ ) {
3842             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3843         }
3844         Perl_re_printf( aTHX_  "\n");
3845     });
3846     Safefree(q);
3847     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3848     return stclass;
3849 }
3850
3851
3852 /* The below joins as many adjacent EXACTish nodes as possible into a single
3853  * one.  The regop may be changed if the node(s) contain certain sequences that
3854  * require special handling.  The joining is only done if:
3855  * 1) there is room in the current conglomerated node to entirely contain the
3856  *    next one.
3857  * 2) they are compatible node types
3858  *
3859  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3860  * these get optimized out
3861  *
3862  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3863  * as possible, even if that means splitting an existing node so that its first
3864  * part is moved to the preceding node.  This would maximise the efficiency of
3865  * memEQ during matching.
3866  *
3867  * If a node is to match under /i (folded), the number of characters it matches
3868  * can be different than its character length if it contains a multi-character
3869  * fold.  *min_subtract is set to the total delta number of characters of the
3870  * input nodes.
3871  *
3872  * And *unfolded_multi_char is set to indicate whether or not the node contains
3873  * an unfolded multi-char fold.  This happens when it won't be known until
3874  * runtime whether the fold is valid or not; namely
3875  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3876  *      target string being matched against turns out to be UTF-8 is that fold
3877  *      valid; or
3878  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3879  *      runtime.
3880  * (Multi-char folds whose components are all above the Latin1 range are not
3881  * run-time locale dependent, and have already been folded by the time this
3882  * function is called.)
3883  *
3884  * This is as good a place as any to discuss the design of handling these
3885  * multi-character fold sequences.  It's been wrong in Perl for a very long
3886  * time.  There are three code points in Unicode whose multi-character folds
3887  * were long ago discovered to mess things up.  The previous designs for
3888  * dealing with these involved assigning a special node for them.  This
3889  * approach doesn't always work, as evidenced by this example:
3890  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3891  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3892  * would match just the \xDF, it won't be able to handle the case where a
3893  * successful match would have to cross the node's boundary.  The new approach
3894  * that hopefully generally solves the problem generates an EXACTFUP node
3895  * that is "sss" in this case.
3896  *
3897  * It turns out that there are problems with all multi-character folds, and not
3898  * just these three.  Now the code is general, for all such cases.  The
3899  * approach taken is:
3900  * 1)   This routine examines each EXACTFish node that could contain multi-
3901  *      character folded sequences.  Since a single character can fold into
3902  *      such a sequence, the minimum match length for this node is less than
3903  *      the number of characters in the node.  This routine returns in
3904  *      *min_subtract how many characters to subtract from the actual
3905  *      length of the string to get a real minimum match length; it is 0 if
3906  *      there are no multi-char folds.  This delta is used by the caller to
3907  *      adjust the min length of the match, and the delta between min and max,
3908  *      so that the optimizer doesn't reject these possibilities based on size
3909  *      constraints.
3910  *
3911  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3912  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3913  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3914  *      EXACTFU nodes.  The node type of such nodes is then changed to
3915  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3916  *      (The procedures in step 1) above are sufficient to handle this case in
3917  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3918  *      the only case where there is a possible fold length change in non-UTF-8
3919  *      patterns.  By reserving a special node type for problematic cases, the
3920  *      far more common regular EXACTFU nodes can be processed faster.
3921  *      regexec.c takes advantage of this.
3922  *
3923  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3924  *      problematic cases.   These all only occur when the pattern is not
3925  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3926  *      length change, it handles the situation where the string cannot be
3927  *      entirely folded.  The strings in an EXACTFish node are folded as much
3928  *      as possible during compilation in regcomp.c.  This saves effort in
3929  *      regex matching.  By using an EXACTFUP node when it is not possible to
3930  *      fully fold at compile time, regexec.c can know that everything in an
3931  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3932  *      case where folding in EXACTFU nodes can't be done at compile time is
3933  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3934  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3935  *      handle two very different cases.  Alternatively, there could have been
3936  *      a node type where there are length changes, one for unfolded, and one
3937  *      for both.  If yet another special case needed to be created, the number
3938  *      of required node types would have to go to 7.  khw figures that even
3939  *      though there are plenty of node types to spare, that the maintenance
3940  *      cost wasn't worth the small speedup of doing it that way, especially
3941  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3942  *
3943  *      There are other cases where folding isn't done at compile time, but
3944  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3945  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3946  *      changes.  Some folds in EXACTF depend on if the runtime target string
3947  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3948  *      when no fold in it depends on the UTF-8ness of the target string.)
3949  *
3950  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3951  *      validity of the fold won't be known until runtime, and so must remain
3952  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3953  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3954  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3955  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3956  *      The reason this is a problem is that the optimizer part of regexec.c
3957  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3958  *      that a character in the pattern corresponds to at most a single
3959  *      character in the target string.  (And I do mean character, and not byte
3960  *      here, unlike other parts of the documentation that have never been
3961  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3962  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3963  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3964  *      EXACTFL nodes, violate the assumption, and they are the only instances
3965  *      where it is violated.  I'm reluctant to try to change the assumption,
3966  *      as the code involved is impenetrable to me (khw), so instead the code
3967  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3968  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3969  *      boolean indicating whether or not the node contains such a fold.  When
3970  *      it is true, the caller sets a flag that later causes the optimizer in
3971  *      this file to not set values for the floating and fixed string lengths,
3972  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3973  *      assumption.  Thus, there is no optimization based on string lengths for
3974  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3975  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3976  *      assumption is wrong only in these cases is that all other non-UTF-8
3977  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3978  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3979  *      EXACTF nodes because we don't know at compile time if it actually
3980  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3981  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3982  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3983  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}", avoiding the
3984  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3985  *      string would require the pattern to be forced into UTF-8, the overhead
3986  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3987  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3988  *      locale.)
3989  *
3990  *      Similarly, the code that generates tries doesn't currently handle
3991  *      not-already-folded multi-char folds, and it looks like a pain to change
3992  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
3993  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
3994  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
3995  *      using /iaa matching will be doing so almost entirely with ASCII
3996  *      strings, so this should rarely be encountered in practice */
3997
/* JOIN_EXACT(scan, min_subtract, unfolded_multi_char, flags)
 *
 * Calls join_exact() on 'scan' when its regkind is EXACT and its opcode is
 * neither LEXACT nor LEXACT_REQ8; otherwise does nothing.  NULL is passed for
 * join_exact()'s 'val' argument and depth+1 for its 'depth', so the expansion
 * site must have 'pRExC_state' and 'depth' in scope.  The value returned by
 * join_exact() is discarded.
 *
 * NOTE(review): the expansion is a bare 'if' with no enclosing braces or
 * do/while wrapper, so using it directly before an 'else' would bind that
 * 'else' to this 'if' -- check the call sites before changing it. */
3998 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
3999     if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
4000                                       && OP(scan) != LEXACT_REQ8)  \
4001         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
4002
4003 STATIC U32
4004 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4005                    UV *min_subtract, bool *unfolded_multi_char,
4006                    U32 flags, regnode *val, U32 depth)
4007 {
4008     /* Merge several consecutive EXACTish nodes into one.
          *
          * Starting from the node after 'scan', NOTHING-kind nodes are skipped
          * and compatible EXACT-kind nodes have their strings appended to
          * 'scan'.  The resulting string is then examined for multi-character
          * fold sequences.
          *
          * scan                 node that absorbs its successors; its OP,
          *                      NEXT_OFF, string and string length may all be
          *                      rewritten.  Must be of regkind EXACT (asserted).
          * min_subtract         output: reset to 0, then increased by the excess
          *                      length (sequence length minus 1) of the
          *                      multi-character fold sequences found in the
          *                      final string (capped for UTF-8 EXACTFL nodes)
          * unfolded_multi_char  output: reset to FALSE; set TRUE at the places
          *                      marked below (a problematic-locale-fold
          *                      character for which is_FOLDS_TO_MULTI_utf8() is
          *                      true in a UTF-8 EXACTFL node, or a sharp s in a
          *                      non-UTF-8 EXACTFAA, EXACTF or EXACTFL node)
          * flags, val           consulted only under EXPERIMENTAL_INPLACESCAN
          * depth                handed to the DEBUG_PEEP macros; marked unused
          *                      when DEBUGGING is not defined
          *
          * Returns 'stopnow', which stays 0 unless the
          * EXPERIMENTAL_INPLACESCAN code sets it to 1. */
4009
         /* 'n' is the node currently being considered; 'next' is the address
          * just past the node most recently processed; 'stringok' is cleared
          * once string merging must stop; 'merged' counts the nodes whose
          * strings were appended to 'scan'. */
4010     regnode *n = regnext(scan);
4011     U32 stringok = 1;
4012     regnode *next = scan + NODE_SZ_STR(scan);
4013     U32 merged = 0;
4014     U32 stopnow = 0;
4015 #ifdef DEBUGGING
         /* Highest regnode slot that the DEBUGGING-only loop at the end of this
          * function overwrites with OPTIMIZED */
4016     regnode *stop = scan;
4017     GET_RE_DEBUG_FLAGS_DECL;
4018 #else
4019     PERL_UNUSED_ARG(depth);
4020 #endif
4021
4022     PERL_ARGS_ASSERT_JOIN_EXACT;
4023 #ifndef EXPERIMENTAL_INPLACESCAN
4024     PERL_UNUSED_ARG(flags);
4025     PERL_UNUSED_ARG(val);
4026 #endif
4027     DEBUG_PEEP("join", scan, depth, 0);
4028
4029     assert(PL_regkind[OP(scan)] == EXACT);
4030
4031     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4032      * EXACT ones that are mergeable to the current one.  The loop also ends
          * at a node whose NEXT_OFF is 0, or when the combined offset of 'scan'
          * and 'n' would reach I16_MAX. */
4033     while (    n
4034            && (    PL_regkind[OP(n)] == NOTHING
4035                || (stringok && PL_regkind[OP(n)] == EXACT))
4036            && NEXT_OFF(n)
4037            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4038     {
4039
             /* No more string merging once a TAIL is reached, or once 'n' lies
              * beyond 'next' (the address just past the last node processed).
              * NOTHING-kind nodes can still be skipped after that. */
4040         if (OP(n) == TAIL || n > next)
4041             stringok = 0;
4042         if (PL_regkind[OP(n)] == NOTHING) {
4043             DEBUG_PEEP("skip:", n, depth, 0);
4044             NEXT_OFF(scan) += NEXT_OFF(n);
4045             next = n + NODE_STEP_REGNODE;
4046 #ifdef DEBUGGING
4047             if (stringok)
4048                 stop = n;
4049 #endif
4050             n = regnext(n);
4051         }
4052         else if (stringok) {
4053             const unsigned int oldl = STR_LEN(scan);
4054             regnode * const nnext = regnext(n);
4055
4056             /* XXX I (khw) kind of doubt that this works on platforms (should
4057              * Perl ever run on one) where U8_MAX is above 255 because of lots
4058              * of other assumptions */
4059             /* Don't join if the sum can't fit into a single node */
4060             if (oldl + STR_LEN(n) > U8_MAX)
4061                 break;
4062
4063             /* Joining something that requires UTF-8 with something that
4064              * doesn't, means the result requires UTF-8. */
4065             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4066                 OP(scan) = EXACT_REQ8;
4067             }
4068             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4069                 ;   /* join is compatible, no need to change OP */
4070             }
4071             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4072                 OP(scan) = EXACTFU_REQ8;
4073             }
4074             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4075                 ;   /* join is compatible, no need to change OP */
4076             }
4077             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4078                 ;   /* join is compatible, no need to change OP */
4079             }
4080             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4081
4082                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4083                   * which can join with EXACTFU ones.  We check for this case
4084                   * here.  These need to be resolved to either EXACTFU or
4085                   * EXACTF at joining time.  They have nothing in them that
4086                   * would forbid them from being the more desirable EXACTFU
4087                   * nodes except that they begin and/or end with a single [Ss].
4088                   * The reason this is problematic is because they could be
4089                   * joined in this loop with an adjacent node that ends and/or
4090                   * begins with [Ss] which would then form the sequence 'ss',
4091                   * which matches differently under /di than /ui, in which case
4092                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4093                   * formed, the nodes get absorbed into any adjacent EXACTFU
4094                   * node.  And if the only adjacent node is EXACTF, they get
4095                   * absorbed into that, under the theory that a longer node is
4096                   * better than two shorter ones, even if one is EXACTFU.  Note
4097                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4098                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4099
4100                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4101
4102                     /* Here the joined node would end with 's'.  If the node
4103                      * following the combination is an EXACTF one, it's better to
4104                      * join this trailing edge 's' node with that one, leaving the
4105                      * current one in 'scan' be the more desirable EXACTFU */
4106                     if (OP(nnext) == EXACTF) {
4107                         break;
4108                     }
4109
4110                     OP(scan) = EXACTFU_S_EDGE;
4111
4112                 }   /* Otherwise, the beginning 's' of the 2nd node just
4113                        becomes an interior 's' in 'scan' */
4114             }
4115             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4116                 ;   /* join is compatible, no need to change OP */
4117             }
4118             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4119
4120                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4121                  * nodes.  But the latter nodes can be also joined with EXACTFU
4122                  * ones, and that is a better outcome, so if the node following
4123                  * 'n' is EXACTFU, quit now so that those two can be joined
4124                  * later */
4125                 if (OP(nnext) == EXACTFU) {
4126                     break;
4127                 }
4128
4129                 /* The join is compatible, and the combined node will be
4130                  * EXACTF.  (These don't care if they begin or end with 's') */
4131             }
4132             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4133                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4134                     && STRING(n)[0] == 's')
4135                 {
4136                     /* When combined, we have the sequence 'ss', which means we
4137                      * have to remain /di */
4138                     OP(scan) = EXACTF;
4139                 }
4140             }
4141             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4142                 if (STRING(n)[0] == 's') {
4143                     ;   /* Here the join is compatible and the combined node
4144                            starts with 's', no need to change OP */
4145                 }
4146                 else {  /* Now the trailing 's' is in the interior */
4147                     OP(scan) = EXACTFU;
4148                 }
4149             }
4150             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4151
4152                 /* The join is compatible, and the combined node will be
4153                  * EXACTF.  (These don't care if they begin or end with 's') */
4154                 OP(scan) = EXACTF;
4155             }
4156             else if (OP(scan) != OP(n)) {
4157
4158                 /* The only other compatible joinings are the same node type */
4159                 break;
4160             }
4161
4162             DEBUG_PEEP("merg", n, depth, 0);
4163             merged++;
4164
                 /* Absorb 'n': scan's NEXT_OFF grows by n's, scan's length grows
                  * by n's, and n's string bytes are placed right after the
                  * 'oldl' bytes that 'scan' held before this merge. */
4165             NEXT_OFF(scan) += NEXT_OFF(n);
4166             setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
4167             next = n + NODE_SZ_STR(n);
4168             /* Now we can overwrite *n : */
4169             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4170 #ifdef DEBUGGING
4171             stop = next - 1;
4172 #endif
4173             n = nnext;
                 /* 'stopnow' is only ever set by the EXPERIMENTAL_INPLACESCAN
                  * code below, on an earlier iteration */
4174             if (stopnow) break;
4175         }
4176
4177 #ifdef EXPERIMENTAL_INPLACESCAN
4178         if (flags && !NEXT_OFF(n)) {
4179             DEBUG_PEEP("atch", val, depth, 0);
4180             if (reg_off_by_arg[OP(n)]) {
4181                 ARG_SET(n, val - n);
4182             }
4183             else {
4184                 NEXT_OFF(n) = val - n;
4185             }
4186             stopnow = 1;
4187         }
4188 #endif
4189     }
4190
4191     /* This temporary node can now be turned into EXACTFU, and must, as
4192      * regexec.c doesn't handle it */
4193     if (OP(scan) == EXACTFU_S_EDGE) {
4194         OP(scan) = EXACTFU;
4195     }
4196
         /* Reset both outputs; they are recomputed below from the final string */
4197     *min_subtract = 0;
4198     *unfolded_multi_char = FALSE;
4199
4200     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4201      * can now analyze for sequences of problematic code points.  (Prior to
4202      * this final joining, sequences could have been split over boundaries, and
4203      * hence missed).  The sequences only happen in folding, hence for any
4204      * non-EXACT EXACTish node */
4205     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4206         U8* s0 = (U8*) STRING(scan);
4207         U8* s = s0;
4208         U8* s_end = s0 + STR_LEN(scan);
4209
4210         int total_count_delta = 0;  /* Total delta number of characters that
4211                                        multi-char folds expand to */
4212
4213         /* One pass is made over the node's string looking for all the
4214          * possibilities.  To avoid some tests in the loop, there are two main
4215          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4216          * non-UTF-8 */
4217         if (UTF) {
4218             U8* folded = NULL;
4219
4220             if (OP(scan) == EXACTFL) {
4221                 U8 *d;
4222
4223                 /* An EXACTFL node would already have been changed to another
4224                  * node type unless there is at least one character in it that
4225                  * is problematic; likely a character whose fold definition
4226                  * won't be known until runtime, and so has yet to be folded.
4227                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4228                  * to handle the UTF-8 case, we need to create a temporary
4229                  * folded copy using UTF-8 locale rules in order to analyze it.
4230                  * This is because our macros that look to see if a sequence is
4231                  * a multi-char fold assume everything is folded (otherwise the
4232                  * tests in those macros would be too complicated and slow).
4233                  * Note that here, the non-problematic folds will have already
4234                  * been done, so we can just copy such characters.  We actually
4235                  * don't completely fold the EXACTFL string.  We skip the
4236                  * unfolded multi-char folds, as that would just create work
4237                  * below to figure out the size they already are */
4238
4239                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4240                 d = folded;
4241                 while (s < s_end) {
4242                     STRLEN s_len = UTF8SKIP(s);
4243                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4244                         Copy(s, d, s_len, U8);
4245                         d += s_len;
4246                     }
4247                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4248                         *unfolded_multi_char = TRUE;
4249                         Copy(s, d, s_len, U8);
4250                         d += s_len;
4251                     }
4252                     else if (isASCII(*s)) {
4253                         *(d++) = toFOLD(*s);
4254                     }
4255                     else {
4256                         STRLEN len;
4257                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4258                         d += len;
4259                     }
4260                     s += s_len;
4261                 }
4262
4263                 /* Point the remainder of the routine to look at our temporary
4264                  * folded copy */
4265                 s = folded;
4266                 s_end = d;
4267             } /* End of creating folded copy of EXACTFL string */
4268
4269             /* Examine the string for a multi-character fold sequence.  UTF-8
4270              * patterns have all characters pre-folded by the time this code is
4271              * executed */
4272             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4273                                      length sequence we are looking for is 2 */
4274             {
4275                 int count = 0;  /* How many characters in a multi-char fold */
4276                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4277                 if (! len) {    /* Not a multi-char fold: get next char */
4278                     s += UTF8SKIP(s);
4279                     continue;
4280                 }
4281
4282                 { /* Here is a generic multi-char fold. */
4283                     U8* multi_end  = s + len;
4284
4285                     /* Count how many characters are in it.  In the case of
4286                      * /aa, no folds which contain ASCII code points are
4287                      * allowed, so check for those, and skip if found. */
4288                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4289                         count = utf8_length(s, multi_end);
4290                         s = multi_end;
4291                     }
4292                     else {
4293                         while (s < multi_end) {
4294                             if (isASCII(*s)) {
                                     /* Abandon this sequence without adding it
                                      * to the delta; scanning resumes at the
                                      * byte after this ASCII one */
4295                                 s++;
4296                                 goto next_iteration;
4297                             }
4298                             else {
4299                                 s += UTF8SKIP(s);
4300                             }
4301                             count++;
4302                         }
4303                     }
4304                 }
4305
4306                 /* The delta is how long the sequence is minus 1 (1 is how long
4307                  * the character that folds to the sequence is) */
4308                 total_count_delta += count - 1;
4309               next_iteration: ;
4310             }
4311
4312             /* We created a temporary folded copy of the string in EXACTFL
4313              * nodes.  Therefore we need to be sure it doesn't go below zero,
4314              * as the real string could be shorter; i.e. the delta is capped at
                  * the number of characters actually in the node's string */
4315             if (OP(scan) == EXACTFL) {
4316                 int total_chars = utf8_length((U8*) STRING(scan),
4317                                            (U8*) STRING(scan) + STR_LEN(scan));
4318                 if (total_count_delta > total_chars) {
4319                     total_count_delta = total_chars;
4320                 }
4321             }
4322
4323             *min_subtract += total_count_delta;
4324             Safefree(folded);
4325         }
4326         else if (OP(scan) == EXACTFAA) {
4327
4328             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4329              * fold to the ASCII range (and there are no existing ones in the
4330              * upper latin1 range).  But, as outlined in the comments preceding
4331              * this function, we need to flag any occurrences of the sharp s.
4332              * This character forbids trie formation (because of added
4333              * complexity) */
                 /* NOTE: the #if below is closed by an #endif inside the
                  * following 'else' branch.  When its condition is false, the
                  * loop in this branch, the branch's closing brace, and the whole
                  * of the 'else' branch's contents are compiled out, leaving
                  * this 'else if' with an empty body. */
4334 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4335    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4336                                       || UNICODE_DOT_DOT_VERSION > 0)
4337             while (s < s_end) {
4338                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4339                     OP(scan) = EXACTFAA_NO_TRIE;
4340                     *unfolded_multi_char = TRUE;
4341                     break;
4342                 }
4343                 s++;
4344             }
4345         }
4346         else {
4347
4348             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4349              * folds that are all Latin1.  As explained in the comments
4350              * preceding this function, we look also for the sharp s in EXACTF
4351              * and EXACTFL nodes; it can be in the final position.  Otherwise
4352              * we can stop looking 1 byte earlier because have to find at least
4353              * two characters for a multi-fold */
4354             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4355                               ? s_end
4356                               : s_end -1;
4357
4358             while (s < upper) {
4359                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4360                 if (! len) {    /* Not a multi-char fold. */
4361                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4362                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4363                     {
4364                         *unfolded_multi_char = TRUE;
4365                     }
4366                     s++;
4367                     continue;
4368                 }
4369
4370                 if (len == 2
4371                     && isALPHA_FOLD_EQ(*s, 's')
4372                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4373                 {
4374
4375                     /* EXACTF nodes need to know that the minimum length
4376                      * changed so that a sharp s in the string can match this
4377                      * ss in the pattern, but they remain EXACTF nodes, as they
4378                      * won't match this unless the target string is UTF-8,
4379                      * which we don't know until runtime.  EXACTFL nodes can't
4380                      * transform into EXACTFU nodes */
4381                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4382                         OP(scan) = EXACTFUP;
4383                     }
4384                 }
4385
4386                 *min_subtract += len - 1;
4387                 s += len;
4388             }
4389 #endif
4390         }
4391
4392         if (     STR_LEN(scan) == 1
4393             &&   isALPHA_A(* STRING(scan))
4394             &&  (         OP(scan) == EXACTFAA
4395                  || (     OP(scan) == EXACTFU
4396                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4397         {
4398             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4399
4400             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4401              * with the mask set to the complement of the bit that differs
4402              * between upper and lower case, and the lowest code point of the
4403              * pair (which the '&' forces) */
4404             OP(scan) = ANYOFM;
4405             ARG_SET(scan, *STRING(scan) & mask);
4406             FLAGS(scan) = mask;
4407         }
4408     }
4409
4410 #ifdef DEBUGGING
4411     /* Allow dumping but overwriting the collection of skipped
4412      * ops and/or strings with fake optimized ops */
4413     n = scan + NODE_SZ_STR(scan);
4414     while (n <= stop) {
4415         OP(n) = OPTIMIZED;
4416         FLAGS(n) = 0;
4417         NEXT_OFF(n) = 0;
4418         n++;
4419     }
4420 #endif
4421     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
         /* Nonzero only if the EXPERIMENTAL_INPLACESCAN code above set it */
4422     return stopnow;
4423 }
4424
4425 /* REx optimizer.  Converts nodes into quicker variants "in place".
4426    Finds fixed substrings.  */
4427
4428 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4429    to the position after last scanned or to NULL. */
4430
/* Allocate one regnode_ssc, store its address in 'and_withp', and hand the
 * pointer to SAVEFREEPV (presumably so that it is released automatically
 * later -- confirm against the SAVEFREEPV definition).  Asserts that
 * 'and_withp' is not already set.  A variable named 'and_withp' must be in
 * scope where the macro is used.
 *
 * The three statements are wrapped in do { } while (0) so that the macro
 * acts as a single statement; without the wrapper, using it as the unbraced
 * body of an 'if' would guard only the assert(). */
#define INIT_AND_WITHP \
    do { \
        assert(!and_withp); \
        Newx(and_withp, 1, regnode_ssc); \
        SAVEFREEPV(and_withp); \
    } while (0)
4435
4436
4437 static void
4438 S_unwind_scan_frames(pTHX_ const void *p)
4439 {
4440     scan_frame *f= (scan_frame *)p;
4441     do {
4442         scan_frame *n= f->next_frame;
4443         Safefree(f);
4444         f= n;
4445     } while (f);
4446 }
4447
4448 /* the return from this sub is the minimum length that could possibly match */
4449 STATIC SSize_t
4450 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4451                         SSize_t *minlenp, SSize_t *deltap,
4452                         regnode *last,
4453                         scan_data_t *data,
4454                         I32 stopparen,
4455                         U32 recursed_depth,
4456                         regnode_ssc *and_withp,
4457                         U32 flags, U32 depth)
4458                         /* scanp: Start here (read-write). */
4459                         /* deltap: Write maxlen-minlen here. */
4460                         /* last: Stop before this one. */
4461                         /* data: string data about the pattern */
4462                         /* stopparen: treat close N as END */
4463                         /* recursed: which subroutines have we recursed into */
4464                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4465 {
4466     dVAR;
4467     /* There must be at least this number of characters to match */
4468     SSize_t min = 0;
4469     I32 pars = 0, code;
4470     regnode *scan = *scanp, *next;
4471     SSize_t delta = 0;
4472     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4473     int is_inf_internal = 0;            /* The studied chunk is infinite */
4474     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4475     scan_data_t data_fake;
4476     SV *re_trie_maxbuff = NULL;
4477     regnode *first_non_open = scan;
4478     SSize_t stopmin = SSize_t_MAX;
4479     scan_frame *frame = NULL;
4480     GET_RE_DEBUG_FLAGS_DECL;
4481
4482     PERL_ARGS_ASSERT_STUDY_CHUNK;
4483     RExC_study_started= 1;
4484
4485     Zero(&data_fake, 1, scan_data_t);
4486
4487     if ( depth == 0 ) {
4488         while (first_non_open && OP(first_non_open) == OPEN)
4489             first_non_open=regnext(first_non_open);
4490     }
4491
4492
4493   fake_study_recurse:
4494     DEBUG_r(
4495         RExC_study_chunk_recursed_count++;
4496     );
4497     DEBUG_OPTIMISE_MORE_r(
4498     {
4499         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4500             depth, (long)stopparen,
4501             (unsigned long)RExC_study_chunk_recursed_count,
4502             (unsigned long)depth, (unsigned long)recursed_depth,
4503             scan,
4504             last);
4505         if (recursed_depth) {
4506             U32 i;
4507             U32 j;
4508             for ( j = 0 ; j < recursed_depth ; j++ ) {
4509                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4510                     if (
4511                         PAREN_TEST(RExC_study_chunk_recursed +
4512                                    ( j * RExC_study_chunk_recursed_bytes), i )
4513                         && (
4514                             !j ||
4515                             !PAREN_TEST(RExC_study_chunk_recursed +
4516                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4517                         )
4518                     ) {
4519                         Perl_re_printf( aTHX_ " %d",(int)i);
4520                         break;
4521                     }
4522                 }
4523                 if ( j + 1 < recursed_depth ) {
4524                     Perl_re_printf( aTHX_  ",");
4525                 }
4526             }
4527         }
4528         Perl_re_printf( aTHX_ "\n");
4529     }
4530     );
4531     while ( scan && OP(scan) != END && scan < last ){
4532         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4533                                    node length to get a real minimum (because
4534                                    the folded version may be shorter) */
4535         bool unfolded_multi_char = FALSE;
4536         /* Peephole optimizer: */
4537         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4538         DEBUG_PEEP("Peep", scan, depth, flags);
4539
4540
4541         /* The reason we do this here is that we need to deal with things like
4542          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4543          * parsing code, as each (?:..) is handled by a different invocation of
4544          * reg() -- Yves
4545          */
4546         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4547
4548         /* Follow the next-chain of the current node and optimize
4549            away all the NOTHINGs from it.  */
4550         if (OP(scan) != CURLYX) {
4551             const int max = (reg_off_by_arg[OP(scan)]
4552                             ? I32_MAX
4553                               /* I32 may be smaller than U16 on CRAYs! */
4554                             : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4555             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4556             int noff;
4557             regnode *n = scan;
4558
4559             /* Skip NOTHING and LONGJMP. */
4560             while (   (n = regnext(n))
4561                    && (   (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4562                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4563                    && off + noff < max)
4564                 off += noff;
4565             if (reg_off_by_arg[OP(scan)])
4566                 ARG(scan) = off;
4567             else
4568                 NEXT_OFF(scan) = off;
4569         }
4570
4571         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4572          * several different things.  */
4573         if ( OP(scan) == DEFINEP ) {
4574             SSize_t minlen = 0;
4575             SSize_t deltanext = 0;
4576             SSize_t fake_last_close = 0;
4577             I32 f = SCF_IN_DEFINE;
4578
4579             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4580             scan = regnext(scan);
4581             assert( OP(scan) == IFTHEN );
4582             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4583
4584             data_fake.last_closep= &fake_last_close;
4585             minlen = *minlenp;
4586             next = regnext(scan);
4587             scan = NEXTOPER(NEXTOPER(scan));
4588             DEBUG_PEEP("scan", scan, depth, flags);
4589             DEBUG_PEEP("next", next, depth, flags);
4590
4591             /* we suppose the run is continuous, last=next...
4592              * NOTE we dont use the return here! */
4593             /* DEFINEP study_chunk() recursion */
4594             (void)study_chunk(pRExC_state, &scan, &minlen,
4595                               &deltanext, next, &data_fake, stopparen,
4596                               recursed_depth, NULL, f, depth+1);
4597
4598             scan = next;
4599         } else
4600         if (
4601             OP(scan) == BRANCH  ||
4602             OP(scan) == BRANCHJ ||
4603             OP(scan) == IFTHEN
4604         ) {
4605             next = regnext(scan);
4606             code = OP(scan);
4607
4608             /* The op(next)==code check below is to see if we
4609              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4610              * IFTHEN is special as it might not appear in pairs.
4611              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4612              * we dont handle it cleanly. */
4613             if (OP(next) == code || code == IFTHEN) {
4614                 /* NOTE - There is similar code to this block below for
4615                  * handling TRIE nodes on a re-study.  If you change stuff here
4616                  * check there too. */
4617                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4618                 regnode_ssc accum;
4619                 regnode * const startbranch=scan;
4620
4621                 if (flags & SCF_DO_SUBSTR) {
4622                     /* Cannot merge strings after this. */
4623                     scan_commit(pRExC_state, data, minlenp, is_inf);
4624                 }
4625
4626                 if (flags & SCF_DO_STCLASS)
4627                     ssc_init_zero(pRExC_state, &accum);
4628
4629                 while (OP(scan) == code) {
4630                     SSize_t deltanext, minnext, fake;
4631                     I32 f = 0;
4632                     regnode_ssc this_class;
4633
4634                     DEBUG_PEEP("Branch", scan, depth, flags);
4635
4636                     num++;
4637                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4638                     if (data) {
4639                         data_fake.whilem_c = data->whilem_c;
4640                         data_fake.last_closep = data->last_closep;
4641                     }
4642                     else
4643                         data_fake.last_closep = &fake;
4644
4645                     data_fake.pos_delta = delta;
4646                     next = regnext(scan);
4647
4648                     scan = NEXTOPER(scan); /* everything */
4649                     if (code != BRANCH)    /* everything but BRANCH */
4650                         scan = NEXTOPER(scan);
4651
4652                     if (flags & SCF_DO_STCLASS) {
4653                         ssc_init(pRExC_state, &this_class);
4654                         data_fake.start_class = &this_class;
4655                         f = SCF_DO_STCLASS_AND;
4656                     }
4657                     if (flags & SCF_WHILEM_VISITED_POS)
4658                         f |= SCF_WHILEM_VISITED_POS;
4659
4660                     /* we suppose the run is continuous, last=next...*/
4661                     /* recurse study_chunk() for each BRANCH in an alternation */
4662                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4663                                       &deltanext, next, &data_fake, stopparen,
4664                                       recursed_depth, NULL, f, depth+1);
4665
4666                     if (min1 > minnext)
4667                         min1 = minnext;
4668                     if (deltanext == SSize_t_MAX) {
4669                         is_inf = is_inf_internal = 1;
4670                         max1 = SSize_t_MAX;
4671                     } else if (max1 < minnext + deltanext)
4672                         max1 = minnext + deltanext;
4673                     scan = next;
4674                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4675                         pars++;
4676                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4677                         if ( stopmin > minnext)
4678                             stopmin = min + min1;
4679                         flags &= ~SCF_DO_SUBSTR;
4680                         if (data)
4681                             data->flags |= SCF_SEEN_ACCEPT;
4682                     }
4683                     if (data) {
4684                         if (data_fake.flags & SF_HAS_EVAL)
4685                             data->flags |= SF_HAS_EVAL;
4686                         data->whilem_c = data_fake.whilem_c;
4687                     }
4688                     if (flags & SCF_DO_STCLASS)
4689                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4690                 }
4691                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4692                     min1 = 0;
4693                 if (flags & SCF_DO_SUBSTR) {
4694                     data->pos_min += min1;
4695                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4696                         data->pos_delta = SSize_t_MAX;
4697                     else
4698                         data->pos_delta += max1 - min1;
4699                     if (max1 != min1 || is_inf)
4700                         data->cur_is_floating = 1;
4701                 }
4702                 min += min1;
4703                 if (delta == SSize_t_MAX
4704                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4705                     delta = SSize_t_MAX;
4706                 else
4707                     delta += max1 - min1;
4708                 if (flags & SCF_DO_STCLASS_OR) {
4709                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4710                     if (min1) {
4711                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4712                         flags &= ~SCF_DO_STCLASS;
4713                     }
4714                 }
4715                 else if (flags & SCF_DO_STCLASS_AND) {
4716                     if (min1) {
4717                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4718                         flags &= ~SCF_DO_STCLASS;
4719                     }
4720                     else {
4721                         /* Switch to OR mode: cache the old value of
4722                          * data->start_class */
4723                         INIT_AND_WITHP;
4724                         StructCopy(data->start_class, and_withp, regnode_ssc);
4725                         flags &= ~SCF_DO_STCLASS_AND;
4726                         StructCopy(&accum, data->start_class, regnode_ssc);
4727                         flags |= SCF_DO_STCLASS_OR;
4728                     }
4729                 }
4730
4731                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4732                         OP( startbranch ) == BRANCH )
4733                 {
4734                 /* demq.
4735
4736                    Assuming this was/is a branch we are dealing with: 'scan'
4737                    now points at the item that follows the branch sequence,
4738                    whatever it is. We now start at the beginning of the
4739                    sequence and look for subsequences of
4740
4741                    BRANCH->EXACT=>x1
4742                    BRANCH->EXACT=>x2
4743                    tail
4744
4745                    which would be constructed from a pattern like
4746                    /A|LIST|OF|WORDS/
4747
4748                    If we can find such a subsequence we need to turn the first
4749                    element into a trie and then add the subsequent branch exact
4750                    strings to the trie.
4751
4752                    We have two cases
4753
4754                      1. patterns where the whole set of branches can be
4755                         converted.
4756
4757                      2. patterns where only a subset can be converted.
4758
4759                    In case 1 we can replace the whole set with a single regop
4760                    for the trie. In case 2 we need to keep the start and end
4761                    branches so
4762
4763                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4764                      becomes BRANCH TRIE; BRANCH X;
4765
4766                   There is an additional case, that being where there is a
4767                   common prefix, which gets split out into an EXACT like node
4768                   preceding the TRIE node.
4769
4770                   If x(1..n)==tail then we can do a simple trie, if not we make
4771                   a "jump" trie, such that when we match the appropriate word
4772                   we "jump" to the appropriate tail node. Essentially we turn
4773                   a nested if into a case structure of sorts.
4774
4775                 */
4776
4777                     int made=0;
4778                     if (!re_trie_maxbuff) {
4779                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4780                         if (!SvIOK(re_trie_maxbuff))
4781                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4782                     }
4783                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4784                         regnode *cur;
4785                         regnode *first = (regnode *)NULL;
4786                         regnode *prev = (regnode *)NULL;
4787                         regnode *tail = scan;
4788                         U8 trietype = 0;
4789                         U32 count=0;
4790
4791                         /* var tail is used because there may be a TAIL
4792                            regop in the way. Ie, the exacts will point to the
4793                            thing following the TAIL, but the last branch will
4794                            point at the TAIL. So we advance tail. If we
4795                            have nested (?:) we may have to move through several
4796                            tails.
4797                          */
4798
4799                         while ( OP( tail ) == TAIL ) {
4800                             /* this is the TAIL generated by (?:) */
4801                             tail = regnext( tail );
4802                         }
4803
4804
4805                         DEBUG_TRIE_COMPILE_r({
4806                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4807                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4808                               depth+1,
4809                               "Looking for TRIE'able sequences. Tail node is ",
4810                               (UV) REGNODE_OFFSET(tail),
4811                               SvPV_nolen_const( RExC_mysv )
4812                             );
4813                         });
4814
4815                         /*
4816
4817                             Step through the branches
4818                                 cur represents each branch,
4819                                 noper is the first thing to be matched as part
4820                                       of that branch
4821                                 noper_next is the regnext() of that node.
4822
4823                             We normally handle a case like this
4824                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4825                             support building with NOJUMPTRIE, which restricts
4826                             the trie logic to structures like /FOO|BAR/.
4827
4828                             If noper is a trieable nodetype then the branch is
4829                             a possible optimization target. If we are building
4830                             under NOJUMPTRIE then we require that noper_next is
4831                             the same as scan (our current position in the regex
4832                             program).
4833
4834                             Once we have two or more consecutive such branches
4835                             we can create a trie of the EXACT's contents and
4836                             stitch it in place into the program.
4837
4838                             If the sequence represents all of the branches in
4839                             the alternation we replace the entire thing with a
4840                             single TRIE node.
4841
4842                             Otherwise when it is a subsequence we need to
4843                             stitch it in place and replace only the relevant
4844                             branches. This means the first branch has to remain
4845                             as it is used by the alternation logic, and its
4846                             next pointer, and needs to be repointed at the item
4847                             on the branch chain following the last branch we
4848                             have optimized away.
4849
4850                             This could be either a BRANCH, in which case the
4851                             subsequence is internal, or it could be the item
4852                             following the branch sequence in which case the
4853                             subsequence is at the end (which does not
4854                             necessarily mean the first node is the start of the
4855                             alternation).
4856
4857                             TRIE_TYPE(X) is a define which maps the optype to a
4858                             trietype, or to 0 if the optype is not trieable.
4859
4860                                 optype          |  trietype
4861                                 ----------------+-----------
4862                                 NOTHING         | NOTHING
4863                                 EXACT           | EXACT
4864                                 EXACT_REQ8      | EXACT
4865                                 EXACTFU         | EXACTFU
4866                                 EXACTFU_REQ8    | EXACTFU
4867                                 EXACTFUP        | EXACTFU
4868                                 EXACTFAA        | EXACTFAA
4869                                 EXACTL          | EXACTL
4870                                 EXACTFLU8       | EXACTFLU8
4871
4872
4873                         */
/* TRIE_TYPE(X): map the regnode opcode X onto the opcode that names its trie
 * family (several opcodes share one family), or yield 0 when X is not one of
 * the opcodes listed here; the branch loop below treats a 0 result as "not
 * trieable".  (X) is expanded once per comparison, so the argument must be
 * an expression without side effects. */
4874 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4875                        ? NOTHING                                            \
4876                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4877                          ? EXACT                                            \
4878                          : (     EXACTFU == (X)                             \
4879                               || EXACTFU_REQ8 == (X)                       \
4880                               || EXACTFUP == (X) )                          \
4881                            ? EXACTFU                                        \
4882                            : ( EXACTFAA == (X) )                            \
4883                              ? EXACTFAA                                     \
4884                              : ( EXACTL == (X) )                            \
4885                                ? EXACTL                                     \
4886                                : ( EXACTFLU8 == (X) )                       \
4887                                  ? EXACTFLU8                                \
4888                                  : 0 )
4889
4890                         /* don't use tail as the end marker for this traversal */
4891                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4892                             regnode * const noper = NEXTOPER( cur );
4893                             U8 noper_type = OP( noper );
4894                             U8 noper_trietype = TRIE_TYPE( noper_type );
4895 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4896                             regnode * const noper_next = regnext( noper );
4897                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4898                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4899 #endif
4900
4901                             DEBUG_TRIE_COMPILE_r({
4902                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4903                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4904                                    depth+1,
4905                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4906
4907                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4908                                 Perl_re_printf( aTHX_  " -> %d:%s",
4909                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4910
4911                                 if ( noper_next ) {
4912                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4913                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4914                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4915                                 }
4916                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4917                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
4918                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4919                                 );
4920                             });
4921
4922                             /* Is noper a trieable nodetype that can be merged
4923                              * with the current trie (if there is one)? */
4924                             if ( noper_trietype
4925                                   &&
4926                                   (
4927                                         ( noper_trietype == NOTHING )
4928                                         || ( trietype == NOTHING )
4929                                         || ( trietype == noper_trietype )
4930                                   )
4931 #ifdef NOJUMPTRIE
4932                                   && noper_next >= tail
4933 #endif
4934                                   && count < U16_MAX)
4935                             {
4936                                 /* Handle a mergeable trieable node. Either we
4937                                  * are the first node in a new trieable sequence,
4938                                  * in which case we do some bookkeeping,
4939                                  * otherwise we update the end pointer. */
4940                                 if ( !first ) {
4941                                     first = cur;
4942                                     if ( noper_trietype == NOTHING ) {
4943 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4944                                         regnode * const noper_next = regnext( noper );
4945                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4946                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4947 #endif
4948
4949                                         if ( noper_next_trietype ) {
4950                                             trietype = noper_next_trietype;
4951                                         } else if (noper_next_type)  {
4952                                             /* a NOTHING regop is 1 regop wide.
4953                                              * We need at least two for a trie
4954                                              * so we can't merge this in */
4955                                             first = NULL;
4956                                         }
4957                                     } else {
4958                                         trietype = noper_trietype;
4959                                     }
4960                                 } else {
4961                                     if ( trietype == NOTHING )
4962                                         trietype = noper_trietype;
4963                                     prev = cur;
4964                                 }
4965                                 if (first)
4966                                     count++;
4967                             } /* end handle mergable triable node */
4968                             else {
4969                                 /* handle unmergeable node -
4970                                  * noper may be either a trieable node which
4971                                  * cannot be merged into the current trie,
4972                                  * or a non-trieable node */
4973                                 if ( prev ) {
4974                                     /* If prev is set and trietype is not
4975                                      * NOTHING then we have found at least two
4976                                      * trieable branch sequences in a row of a
4977                                      * similar trietype so we can turn them
4978                                      * into a trie. If/when we allow NOTHING to
4979                                      * start a trie sequence this condition
4980                                      * will be required, and it isn't expensive
4981                                      * so we leave it in for now. */
4982                                     if ( trietype && trietype != NOTHING )
4983                                         make_trie( pRExC_state,
4984                                                 startbranch, first, cur, tail,
4985                                                 count, trietype, depth+1 );
4986                                     prev = NULL; /* note: we clear/update
4987                                                     first, trietype etc below,
4988                                                     so we dont do it here */
4989                                 }
4990                                 if ( noper_trietype
4991 #ifdef NOJUMPTRIE
4992                                      && noper_next >= tail
4993 #endif
4994                                 ){
4995                                     /* noper is triable, so we can start a new
4996                                      * trie sequence */
4997                                     count = 1;
4998                                     first = cur;
4999                                     trietype = noper_trietype;
5000                                 } else if (first) {
5001                                     /* if we already saw a first but the
5002                                      * current node is not triable then we have
5003                                      * to reset the first information. */
5004                                     count = 0;
5005                                     first = NULL;
5006                                     trietype = 0;
5007                                 }
5008                             } /* end handle unmergable node */
5009                         } /* loop over branches */
5010                         DEBUG_TRIE_COMPILE_r({
5011                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5012                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5013                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5014                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5015                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5016                                PL_reg_name[trietype]
5017                             );
5018
5019                         });
5020                         if ( prev && trietype ) {
5021                             if ( trietype != NOTHING ) {
5022                                 /* the last branch of the sequence was part of
5023                                  * a trie, so we have to construct it here
5024                                  * outside of the loop */
5025                                 made= make_trie( pRExC_state, startbranch,
5026                                                  first, scan, tail, count,
5027                                                  trietype, depth+1 );
5028 #ifdef TRIE_STUDY_OPT
5029                                 if ( ((made == MADE_EXACT_TRIE &&
5030                                      startbranch == first)
5031                                      || ( first_non_open == first )) &&
5032                                      depth==0 ) {
5033                                     flags |= SCF_TRIE_RESTUDY;
5034                                     if ( startbranch == first
5035                                          && scan >= tail )
5036                                     {
5037                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5038                                     }
5039                                 }
5040 #endif
5041                             } else {
5042                                 /* at this point we know whatever we have is a
5043                                  * NOTHING sequence/branch AND if 'startbranch'
5044                                  * is 'first' then we can turn the whole thing
5045                                  * into a NOTHING
5046                                  */
5047                                 if ( startbranch == first ) {
5048                                     regnode *opt;
5049                                     /* the entire thing is a NOTHING sequence,
5050                                      * something like this: (?:|) So we can
5051                                      * turn it into a plain NOTHING op. */
5052                                     DEBUG_TRIE_COMPILE_r({
5053                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5054                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5055                                           depth+1,
5056                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5057
5058                                     });
5059                                     OP(startbranch)= NOTHING;
5060                                     NEXT_OFF(startbranch)= tail - startbranch;
5061                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5062                                         OP(opt)= OPTIMIZED;
5063                                 }
5064                             }
5065                         } /* end if ( prev) */
5066                     } /* TRIE_MAXBUF is non zero */
5067                 } /* do trie */
5068
5069             }
5070             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5071                 scan = NEXTOPER(NEXTOPER(scan));
5072             } else                      /* single branch is optimized. */
5073                 scan = NEXTOPER(scan);
5074             continue;
5075         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5076             I32 paren = 0;
5077             regnode *start = NULL;
5078             regnode *end = NULL;
5079             U32 my_recursed_depth= recursed_depth;
5080
5081             if (OP(scan) != SUSPEND) { /* GOSUB */
5082                 /* Do setup, note this code has side effects beyond
5083                  * the rest of this block. Specifically setting
5084                  * RExC_recurse[] must happen at least once during
5085                  * study_chunk(). */
5086                 paren = ARG(scan);
5087                 RExC_recurse[ARG2L(scan)] = scan;
5088                 start = REGNODE_p(RExC_open_parens[paren]);
5089                 end   = REGNODE_p(RExC_close_parens[paren]);
5090
5091                 /* NOTE we MUST always execute the above code, even
5092                  * if we do nothing with a GOSUB */
5093                 if (
5094                     ( flags & SCF_IN_DEFINE )
5095                     ||
5096                     (
5097                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5098                         &&
5099                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5100                     )
5101                 ) {
5102                     /* no need to do anything here if we are in a define. */
5103                     /* or we are after some kind of infinite construct
5104                      * so we can skip recursing into this item.
5105                      * Since it is infinite we will not change the maxlen
5106                      * or delta, and if we miss something that might raise
5107                      * the minlen it will merely pessimise a little.
5108                      *
5109                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5110                      * might result in a minlen of 1 and not of 4,
5111                      * but this doesn't make us mismatch, just try a bit
5112                      * harder than we should.
5113                      * */
5114                     scan= regnext(scan);
5115                     continue;
5116                 }
5117
5118                 if (
5119                     !recursed_depth
5120                     ||
5121                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5122                 ) {
5123                     /* it is quite possible that there are more efficient ways
5124                      * to do this. We maintain a bitmap per level of recursion
5125                      * of which patterns we have entered so we can detect if a
5126                      * pattern creates a possible infinite loop. When we
5127                      * recurse down a level we copy the previous level's bitmap
5128                      * down. When we are at recursion level 0 we zero the top
5129                      * level bitmap. It would be nice to implement a different
5130                      * more efficient way of doing this. In particular the top
5131                      * level bitmap may be unnecessary.
5132                      */
5133                     if (!recursed_depth) {
5134                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5135                     } else {
5136                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5137                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5138                              RExC_study_chunk_recursed_bytes, U8);
5139                     }
5140                     /* we haven't recursed into this paren yet, so recurse into it */
5141                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5142                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5143                     my_recursed_depth= recursed_depth + 1;
5144                 } else {
5145                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5146                     /* some form of infinite recursion, assume infinite length
5147                      * */
5148                     if (flags & SCF_DO_SUBSTR) {
5149                         scan_commit(pRExC_state, data, minlenp, is_inf);
5150                         data->cur_is_floating = 1;
5151                     }
5152                     is_inf = is_inf_internal = 1;
5153                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5154                         ssc_anything(data->start_class);
5155                     flags &= ~SCF_DO_STCLASS;
5156
5157                     start= NULL; /* reset start so we dont recurse later on. */
5158                 }
5159             } else {
5160                 paren = stopparen;
5161                 start = scan + 2;
5162                 end = regnext(scan);
5163             }
5164             if (start) {
5165                 scan_frame *newframe;
5166                 assert(end);
5167                 if (!RExC_frame_last) {
5168                     Newxz(newframe, 1, scan_frame);
5169                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5170                     RExC_frame_head= newframe;
5171                     RExC_frame_count++;
5172                 } else if (!RExC_frame_last->next_frame) {
5173                     Newxz(newframe, 1, scan_frame);
5174                     RExC_frame_last->next_frame= newframe;
5175                     newframe->prev_frame= RExC_frame_last;
5176                     RExC_frame_count++;
5177                 } else {
5178                     newframe= RExC_frame_last->next_frame;
5179                 }
5180                 RExC_frame_last= newframe;
5181
5182                 newframe->next_regnode = regnext(scan);
5183                 newframe->last_regnode = last;
5184                 newframe->stopparen = stopparen;
5185                 newframe->prev_recursed_depth = recursed_depth;
5186                 newframe->this_prev_frame= frame;
5187
5188                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5189                 DEBUG_PEEP("fnew", scan, depth, flags);
5190
5191                 frame = newframe;
5192                 scan =  start;
5193                 stopparen = paren;
5194                 last = end;
5195                 depth = depth + 1;
5196                 recursed_depth= my_recursed_depth;
5197
5198                 continue;
5199             }
5200         }
5201         else if (   OP(scan) == EXACT
5202                  || OP(scan) == LEXACT
5203                  || OP(scan) == EXACT_REQ8
5204                  || OP(scan) == LEXACT_REQ8
5205                  || OP(scan) == EXACTL)
5206         {
5207             SSize_t l = STR_LEN(scan);
5208             UV uc;
5209             assert(l);
5210             if (UTF) {
5211                 const U8 * const s = (U8*)STRING(scan);
5212                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5213                 l = utf8_length(s, s + l);
5214             } else {
5215                 uc = *((U8*)STRING(scan));
5216             }
5217             min += l;
5218             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5219                 /* The code below prefers earlier match for fixed
5220                    offset, later match for variable offset.  */
5221                 if (data->last_end == -1) { /* Update the start info. */
5222                     data->last_start_min = data->pos_min;
5223                     data->last_start_max = is_inf
5224                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
5225                 }
5226                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5227                 if (UTF)
5228                     SvUTF8_on(data->last_found);
5229                 {
5230                     SV * const sv = data->last_found;
5231                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5232                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5233                     if (mg && mg->mg_len >= 0)
5234                         mg->mg_len += utf8_length((U8*)STRING(scan),
5235                                               (U8*)STRING(scan)+STR_LEN(scan));
5236                 }
5237                 data->last_end = data->pos_min + l;
5238                 data->pos_min += l; /* As in the first entry. */
5239                 data->flags &= ~SF_BEFORE_EOL;
5240             }
5241
5242             /* ANDing the code point leaves at most it, and not in locale, and
5243              * can't match null string */
5244             if (flags & SCF_DO_STCLASS_AND) {
5245                 ssc_cp_and(data->start_class, uc);
5246                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5247                 ssc_clear_locale(data->start_class);
5248             }
5249             else if (flags & SCF_DO_STCLASS_OR) {
5250                 ssc_add_cp(data->start_class, uc);
5251                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5252
5253                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5254                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5255             }
5256             flags &= ~SCF_DO_STCLASS;
5257         }
5258         else if (PL_regkind[OP(scan)] == EXACT) {
5259             /* But OP != EXACT!, so is EXACTFish */
5260             SSize_t l = STR_LEN(scan);
5261             const U8 * s = (U8*)STRING(scan);
5262
5263             /* Search for fixed substrings supports EXACT only. */
5264             if (flags & SCF_DO_SUBSTR) {
5265                 assert(data);
5266                 scan_commit(pRExC_state, data, minlenp, is_inf);
5267             }
5268             if (UTF) {
5269                 l = utf8_length(s, s + l);
5270             }
5271             if (unfolded_multi_char) {
5272                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5273             }
5274             min += l - min_subtract;
5275             assert (min >= 0);
5276             delta += min_subtract;
5277             if (flags & SCF_DO_SUBSTR) {
5278                 data->pos_min += l - min_subtract;
5279                 if (data->pos_min < 0) {
5280                     data->pos_min = 0;
5281                 }
5282                 data->pos_delta += min_subtract;
5283                 if (min_subtract) {
5284                     data->cur_is_floating = 1; /* float */
5285                 }
5286             }
5287
5288             if (flags & SCF_DO_STCLASS) {
5289                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5290
5291                 assert(EXACTF_invlist);
5292                 if (flags & SCF_DO_STCLASS_AND) {
5293                     if (OP(scan) != EXACTFL)
5294                         ssc_clear_locale(data->start_class);
5295                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5296                     ANYOF_POSIXL_ZERO(data->start_class);
5297                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5298                 }
5299                 else {  /* SCF_DO_STCLASS_OR */
5300                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5301                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5302
5303                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5304                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5305                 }
5306                 flags &= ~SCF_DO_STCLASS;
5307                 SvREFCNT_dec(EXACTF_invlist);
5308             }
5309         }
5310         else if (REGNODE_VARIES(OP(scan))) {
5311             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5312             I32 fl = 0, f = flags;
5313             regnode * const oscan = scan;
5314             regnode_ssc this_class;
5315             regnode_ssc *oclass = NULL;
5316             I32 next_is_eval = 0;
5317
5318             switch (PL_regkind[OP(scan)]) {
5319             case WHILEM:                /* End of (?:...)* . */
5320                 scan = NEXTOPER(scan);
5321                 goto finish;
5322             case PLUS:
5323                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5324                     next = NEXTOPER(scan);
5325                     if (   OP(next) == EXACT
5326                         || OP(next) == LEXACT
5327                         || OP(next) == EXACT_REQ8
5328                         || OP(next) == LEXACT_REQ8
5329                         || OP(next) == EXACTL
5330                         || (flags & SCF_DO_STCLASS))
5331                     {
5332                         mincount = 1;
5333                         maxcount = REG_INFTY;
5334                         next = regnext(scan);
5335                         scan = NEXTOPER(scan);
5336                         goto do_curly;
5337                     }
5338                 }
5339                 if (flags & SCF_DO_SUBSTR)
5340                     data->pos_min++;
5341                 min++;
5342                 /* FALLTHROUGH */
5343             case STAR:
5344                 next = NEXTOPER(scan);
5345
5346                 /* This temporary node can now be turned into EXACTFU, and
5347                  * must, as regexec.c doesn't handle it */
5348                 if (OP(next) == EXACTFU_S_EDGE) {
5349                     OP(next) = EXACTFU;
5350                 }
5351
5352                 if (     STR_LEN(next) == 1
5353                     &&   isALPHA_A(* STRING(next))
5354                     && (         OP(next) == EXACTFAA
5355                         || (     OP(next) == EXACTFU
5356                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5357                 {
5358                     /* These differ in just one bit */
5359                     U8 mask = ~ ('A' ^ 'a');
5360
5361                     assert(isALPHA_A(* STRING(next)));
5362
5363                     /* Then replace it by an ANYOFM node, with
5364                     * the mask set to the complement of the
5365                     * bit that differs between upper and lower
5366                     * case, and the lowest code point of the
5367                     * pair (which the '&' forces) */
5368                     OP(next) = ANYOFM;
5369                     ARG_SET(next, *STRING(next) & mask);
5370                     FLAGS(next) = mask;
5371                 }
5372
5373                 if (flags & SCF_DO_STCLASS) {
5374                     mincount = 0;
5375                     maxcount = REG_INFTY;
5376                     next = regnext(scan);
5377                     scan = NEXTOPER(scan);
5378                     goto do_curly;
5379                 }
5380                 if (flags & SCF_DO_SUBSTR) {
5381                     scan_commit(pRExC_state, data, minlenp, is_inf);
5382                     /* Cannot extend fixed substrings */
5383                     data->cur_is_floating = 1; /* float */
5384                 }
5385                 is_inf = is_inf_internal = 1;
5386                 scan = regnext(scan);
5387                 goto optimize_curly_tail;
5388             case CURLY:
5389                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5390                     && (scan->flags == stopparen))
5391                 {
5392                     mincount = 1;
5393                     maxcount = 1;
5394                 } else {
5395                     mincount = ARG1(scan);
5396                     maxcount = ARG2(scan);
5397                 }
5398                 next = regnext(scan);
5399                 if (OP(scan) == CURLYX) {
5400                     I32 lp = (data ? *(data->last_closep) : 0);
5401                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5402                 }
5403                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5404                 next_is_eval = (OP(scan) == EVAL);
5405               do_curly:
5406                 if (flags & SCF_DO_SUBSTR) {
5407                     if (mincount == 0)
5408                         scan_commit(pRExC_state, data, minlenp, is_inf);
5409                     /* Cannot extend fixed substrings */
5410                     pos_before = data->pos_min;
5411                 }
5412                 if (data) {
5413                     fl = data->flags;
5414                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5415                     if (is_inf)
5416                         data->flags |= SF_IS_INF;
5417                 }
5418                 if (flags & SCF_DO_STCLASS) {
5419                     ssc_init(pRExC_state, &this_class);
5420                     oclass = data->start_class;
5421                     data->start_class = &this_class;
5422                     f |= SCF_DO_STCLASS_AND;
5423                     f &= ~SCF_DO_STCLASS_OR;
5424                 }
5425                 /* Exclude from super-linear cache processing any {n,m}
5426                    regops for which the combination of input pos and regex
5427                    pos is not enough information to determine if a match
5428                    will be possible.
5429
5430                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5431                    regex pos at the \s*, the prospects for a match depend not
5432                    only on the input position but also on how many (bar\s*)
5433                    repeats into the {4,8} we are. */
5434                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5435                     f &= ~SCF_WHILEM_VISITED_POS;
5436
5437                 /* This will finish on WHILEM, setting scan, or on NULL: */
5438                 /* recurse study_chunk() on loop bodies */
5439                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5440                                   last, data, stopparen, recursed_depth, NULL,
5441                                   (mincount == 0
5442                                    ? (f & ~SCF_DO_SUBSTR)
5443                                    : f)
5444                                   ,depth+1);
5445
5446                 if (flags & SCF_DO_STCLASS)
5447                     data->start_class = oclass;
5448                 if (mincount == 0 || minnext == 0) {
5449                     if (flags & SCF_DO_STCLASS_OR) {
5450                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5451                     }
5452                     else if (flags & SCF_DO_STCLASS_AND) {
5453                         /* Switch to OR mode: cache the old value of
5454                          * data->start_class */
5455                         INIT_AND_WITHP;
5456                         StructCopy(data->start_class, and_withp, regnode_ssc);
5457                         flags &= ~SCF_DO_STCLASS_AND;
5458                         StructCopy(&this_class, data->start_class, regnode_ssc);
5459                         flags |= SCF_DO_STCLASS_OR;
5460                         ANYOF_FLAGS(data->start_class)
5461                                                 |= SSC_MATCHES_EMPTY_STRING;
5462                     }
5463                 } else {                /* Non-zero len */
5464                     if (flags & SCF_DO_STCLASS_OR) {
5465                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5466                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5467                     }
5468                     else if (flags & SCF_DO_STCLASS_AND)
5469                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5470                     flags &= ~SCF_DO_STCLASS;
5471                 }
5472                 if (!scan)              /* It was not CURLYX, but CURLY. */
5473                     scan = next;
5474                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5475                     /* ? quantifier ok, except for (?{ ... }) */
5476                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5477                     && (minnext == 0) && (deltanext == 0)
5478                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5479                     && maxcount <= REG_INFTY/3) /* Complement check for big
5480                                                    count */
5481                 {
5482                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5483                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5484                             "Quantifier unexpected on zero-length expression "
5485                             "in regex m/%" UTF8f "/",
5486                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5487                                   RExC_precomp)));
5488                 }
5489
5490                 min += minnext * mincount;
5491                 is_inf_internal |= deltanext == SSize_t_MAX
5492                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5493                 is_inf |= is_inf_internal;
5494                 if (is_inf) {
5495                     delta = SSize_t_MAX;
5496                 } else {
5497                     delta += (minnext + deltanext) * maxcount
5498                              - minnext * mincount;
5499                 }
5500                 /* Try powerful optimization CURLYX => CURLYN. */
5501                 if (  OP(oscan) == CURLYX && data
5502                       && data->flags & SF_IN_PAR
5503                       && !(data->flags & SF_HAS_EVAL)
5504                       && !deltanext && minnext == 1 ) {
5505                     /* Try to optimize to CURLYN.  */
5506                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5507                     regnode * const nxt1 = nxt;
5508 #ifdef DEBUGGING
5509                     regnode *nxt2;
5510 #endif
5511
5512                     /* Skip open. */
5513                     nxt = regnext(nxt);
5514                     if (!REGNODE_SIMPLE(OP(nxt))
5515                         && !(PL_regkind[OP(nxt)] == EXACT
5516                              && STR_LEN(nxt) == 1))
5517                         goto nogo;
5518 #ifdef DEBUGGING
5519                     nxt2 = nxt;
5520 #endif
5521                     nxt = regnext(nxt);
5522                     if (OP(nxt) != CLOSE)
5523                         goto nogo;
5524                     if (RExC_open_parens) {
5525
5526                         /*open->CURLYM*/
5527                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5528
5529                         /*close->while*/
5530                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5531                     }
5532                     /* Now we know that nxt2 is the only contents: */
5533                     oscan->flags = (U8)ARG(nxt);
5534                     OP(oscan) = CURLYN;
5535                     OP(nxt1) = NOTHING; /* was OPEN. */
5536
5537 #ifdef DEBUGGING
5538                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5539                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5540                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5541                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5542                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5543                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5544 #endif
5545                 }
5546               nogo:
5547
5548                 /* Try optimization CURLYX => CURLYM. */
5549                 if (  OP(oscan) == CURLYX && data
5550                       && !(data->flags & SF_HAS_PAR)
5551                       && !(data->flags & SF_HAS_EVAL)
5552                       && !deltanext     /* atom is fixed width */
5553                       && minnext != 0   /* CURLYM can't handle zero width */
5554
5555                          /* Nor characters whose fold at run-time may be
5556                           * multi-character */
5557                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5558                 ) {
5559                     /* XXXX How to optimize if data == 0? */
5560                     /* Optimize to a simpler form.  */
5561                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5562                     regnode *nxt2;
5563
5564                     OP(oscan) = CURLYM;
5565                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5566                             && (OP(nxt2) != WHILEM))
5567                         nxt = nxt2;
5568                     OP(nxt2)  = SUCCEED; /* Was WHILEM */
5569                     /* Need to optimize away the parentheses. */
5570                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5571                         /* Set the parenthesis number.  */
5572                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5573
5574                         oscan->flags = (U8)ARG(nxt);
5575                         if (RExC_open_parens) {
5576                              /*open->CURLYM*/
5577                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5578
5579                             /*close->NOTHING*/
5580                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5581                                                          + 1;
5582                         }
5583                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5584                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5585
5586 #ifdef DEBUGGING
5587                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5588                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5589                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5590                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5591 #endif
5592 #if 0
5593                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5594                             regnode *nnxt = regnext(nxt1);
5595                             if (nnxt == nxt) {
5596                                 if (reg_off_by_arg[OP(nxt1)])
5597                                     ARG_SET(nxt1, nxt2 - nxt1);
5598                                 else if (nxt2 - nxt1 < U16_MAX)
5599                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5600                                 else
5601                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5602                             }
5603                             nxt1 = nnxt;
5604                         }
5605 #endif
5606                         /* Optimize again: */
5607                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5608                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5609                                     NULL, stopparen, recursed_depth, NULL, 0,
5610                                     depth+1);
5611                     }
5612                     else
5613                         oscan->flags = 0;
5614                 }
5615                 else if ((OP(oscan) == CURLYX)
5616                          && (flags & SCF_WHILEM_VISITED_POS)
5617                          /* See the comment on a similar expression above.
5618                             However, this time it's not a subexpression
5619                             we care about, but the expression itself. */
5620                          && (maxcount == REG_INFTY)
5621                          && data) {
5622                     /* This stays as CURLYX, we can put the count/of pair. */
5623                     /* Find WHILEM (as in regexec.c) */
5624                     regnode *nxt = oscan + NEXT_OFF(oscan);
5625
5626                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5627                         nxt += ARG(nxt);
5628                     nxt = PREVOPER(nxt);
5629                     if (nxt->flags & 0xf) {
5630                         /* we've already set whilem count on this node */
5631                     } else if (++data->whilem_c < 16) {
5632                         assert(data->whilem_c <= RExC_whilem_seen);
5633                         nxt->flags = (U8)(data->whilem_c
5634                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5635                     }
5636                 }
5637                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5638                     pars++;
5639                 if (flags & SCF_DO_SUBSTR) {
5640                     SV *last_str = NULL;
5641                     STRLEN last_chrs = 0;
5642                     int counted = mincount != 0;
5643
5644                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5645                                                                   string. */
5646                         SSize_t b = pos_before >= data->last_start_min
5647                             ? pos_before : data->last_start_min;
5648                         STRLEN l;
5649                         const char * const s = SvPV_const(data->last_found, l);
5650                         SSize_t old = b - data->last_start_min;
5651                         assert(old >= 0);
5652
5653                         if (UTF)
5654                             old = utf8_hop_forward((U8*)s, old,
5655                                                (U8 *) SvEND(data->last_found))
5656                                 - (U8*)s;
5657                         l -= old;
5658                         /* Get the added string: */
5659                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5660                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5661                                             (U8*)(s + old + l)) : l;
5662                         if (deltanext == 0 && pos_before == b) {
5663                             /* What was added is a constant string */
5664                             if (mincount > 1) {
5665
5666                                 SvGROW(last_str, (mincount * l) + 1);
5667                                 repeatcpy(SvPVX(last_str) + l,
5668                                           SvPVX_const(last_str), l,
5669                                           mincount - 1);
5670                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5671                                 /* Add additional parts. */
5672                                 SvCUR_set(data->last_found,
5673                                           SvCUR(data->last_found) - l);
5674                                 sv_catsv(data->last_found, last_str);
5675                                 {
5676                                     SV * sv = data->last_found;
5677                                     MAGIC *mg =
5678                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5679                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5680                                     if (mg && mg->mg_len >= 0)
5681                                         mg->mg_len += last_chrs * (mincount-1);
5682                                 }
5683                                 last_chrs *= mincount;
5684                                 data->last_end += l * (mincount - 1);
5685                             }
5686                         } else {
5687                             /* start offset must point into the last copy */
5688                             data->last_start_min += minnext * (mincount - 1);
5689                             data->last_start_max =
5690                               is_inf
5691                                ? SSize_t_MAX
5692                                : data->last_start_max +
5693                                  (maxcount - 1) * (minnext + data->pos_delta);
5694                         }
5695                     }
5696                     /* It is counted once already... */
5697                     data->pos_min += minnext * (mincount - counted);
5698 #if 0
5699 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5700                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5701                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5702     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5703     (UV)mincount);
5704 if (deltanext != SSize_t_MAX)
5705 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5706     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5707           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5708 #endif
5709                     if (deltanext == SSize_t_MAX
5710                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5711                         data->pos_delta = SSize_t_MAX;
5712                     else
5713                         data->pos_delta += - counted * deltanext +
5714                         (minnext + deltanext) * maxcount - minnext * mincount;
5715                     if (mincount != maxcount) {
5716                          /* Cannot extend fixed substrings found inside
5717                             the group.  */
5718                         scan_commit(pRExC_state, data, minlenp, is_inf);
5719                         if (mincount && last_str) {
5720                             SV * const sv = data->last_found;
5721                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5722                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5723
5724                             if (mg)
5725                                 mg->mg_len = -1;
5726                             sv_setsv(sv, last_str);
5727                             data->last_end = data->pos_min;
5728                             data->last_start_min = data->pos_min - last_chrs;
5729                             data->last_start_max = is_inf
5730                                 ? SSize_t_MAX
5731                                 : data->pos_min + data->pos_delta - last_chrs;
5732                         }
5733                         data->cur_is_floating = 1; /* float */
5734                     }
5735                     SvREFCNT_dec(last_str);
5736                 }
5737                 if (data && (fl & SF_HAS_EVAL))
5738                     data->flags |= SF_HAS_EVAL;
5739               optimize_curly_tail:
5740                 if (OP(oscan) != CURLYX) {
5741                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5742                            && NEXT_OFF(next))
5743                         NEXT_OFF(oscan) += NEXT_OFF(next);
5744                 }
5745                 continue;
5746
5747             default:
5748 #ifdef DEBUGGING
5749                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5750                                                                     OP(scan));
5751 #endif
5752             case REF:
5753             case CLUMP:
5754                 if (flags & SCF_DO_SUBSTR) {
5755                     /* Cannot expect anything... */
5756                     scan_commit(pRExC_state, data, minlenp, is_inf);
5757                     data->cur_is_floating = 1; /* float */
5758                 }
5759                 is_inf = is_inf_internal = 1;
5760                 if (flags & SCF_DO_STCLASS_OR) {
5761                     if (OP(scan) == CLUMP) {
5762                         /* Actually is any start char, but very few code points
5763                          * aren't start characters */
5764                         ssc_match_all_cp(data->start_class);
5765                     }
5766                     else {
5767                         ssc_anything(data->start_class);
5768                     }
5769                 }
5770                 flags &= ~SCF_DO_STCLASS;
5771                 break;
5772             }
5773         }
5774         else if (OP(scan) == LNBREAK) {
5775             if (flags & SCF_DO_STCLASS) {
5776                 if (flags & SCF_DO_STCLASS_AND) {
5777                     ssc_intersection(data->start_class,
5778                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5779                     ssc_clear_locale(data->start_class);
5780                     ANYOF_FLAGS(data->start_class)
5781                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5782                 }
5783                 else if (flags & SCF_DO_STCLASS_OR) {
5784                     ssc_union(data->start_class,
5785                               PL_XPosix_ptrs[_CC_VERTSPACE],
5786                               FALSE);
5787                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5788
5789                     /* See commit msg for
5790                      * 749e076fceedeb708a624933726e7989f2302f6a */
5791                     ANYOF_FLAGS(data->start_class)
5792                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5793                 }
5794                 flags &= ~SCF_DO_STCLASS;
5795             }
5796             min++;
5797             if (delta != SSize_t_MAX)
5798                 delta++;    /* Because of the 2 char string cr-lf */
5799             if (flags & SCF_DO_SUBSTR) {
5800                 /* Cannot expect anything... */
5801                 scan_commit(pRExC_state, data, minlenp, is_inf);
5802                 data->pos_min += 1;
5803                 if (data->pos_delta != SSize_t_MAX) {
5804                     data->pos_delta += 1;
5805                 }
5806                 data->cur_is_floating = 1; /* float */
5807             }
5808         }
5809         else if (REGNODE_SIMPLE(OP(scan))) {
5810
5811             if (flags & SCF_DO_SUBSTR) {
5812                 scan_commit(pRExC_state, data, minlenp, is_inf);
5813                 data->pos_min++;
5814             }
5815             min++;
5816             if (flags & SCF_DO_STCLASS) {
5817                 bool invert = 0;
5818                 SV* my_invlist = NULL;
5819                 U8 namedclass;
5820
5821                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5822                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5823
5824                 /* Some of the logic below assumes that switching
5825                    locale on will only add false positives. */
5826                 switch (OP(scan)) {
5827
5828                 default:
5829 #ifdef DEBUGGING
5830                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5831                                                                      OP(scan));
5832 #endif
5833                 case SANY:
5834                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5835                         ssc_match_all_cp(data->start_class);
5836                     break;
5837
5838                 case REG_ANY:
5839                     {
5840                         SV* REG_ANY_invlist = _new_invlist(2);
5841                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5842                                                             '\n');
5843                         if (flags & SCF_DO_STCLASS_OR) {
5844                             ssc_union(data->start_class,
5845                                       REG_ANY_invlist,
5846                                       TRUE /* TRUE => invert, hence all but \n
5847                                             */
5848                                       );
5849                         }
5850                         else if (flags & SCF_DO_STCLASS_AND) {
5851                             ssc_intersection(data->start_class,
5852                                              REG_ANY_invlist,
5853                                              TRUE  /* TRUE => invert */
5854                                              );
5855                             ssc_clear_locale(data->start_class);
5856                         }
5857                         SvREFCNT_dec_NN(REG_ANY_invlist);
5858                     }
5859                     break;
5860
5861                 case ANYOFD:
5862                 case ANYOFL:
5863                 case ANYOFPOSIXL:
5864                 case ANYOFH:
5865                 case ANYOFHb:
5866                 case ANYOFHr:
5867                 case ANYOFHs:
5868                 case ANYOF:
5869                     if (flags & SCF_DO_STCLASS_AND)
5870                         ssc_and(pRExC_state, data->start_class,
5871                                 (regnode_charclass *) scan);
5872                     else
5873                         ssc_or(pRExC_state, data->start_class,
5874                                                           (regnode_charclass *) scan);
5875                     break;
5876
5877                 case NANYOFM:
5878                 case ANYOFM:
5879                   {
5880                     SV* cp_list = get_ANYOFM_contents(scan);
5881
5882                     if (flags & SCF_DO_STCLASS_OR) {
5883                         ssc_union(data->start_class, cp_list, invert);
5884                     }
5885                     else if (flags & SCF_DO_STCLASS_AND) {
5886                         ssc_intersection(data->start_class, cp_list, invert);
5887                     }
5888
5889                     SvREFCNT_dec_NN(cp_list);
5890                     break;
5891                   }
5892
5893                 case ANYOFR:
5894                 case ANYOFRb:
5895                   {
5896                     SV* cp_list = NULL;
5897
5898                     cp_list = _add_range_to_invlist(cp_list,
5899                                         ANYOFRbase(scan),
5900                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
5901
5902                     if (flags & SCF_DO_STCLASS_OR) {
5903                         ssc_union(data->start_class, cp_list, invert);
5904                     }
5905                     else if (flags & SCF_DO_STCLASS_AND) {
5906                         ssc_intersection(data->start_class, cp_list, invert);
5907                     }
5908
5909                     SvREFCNT_dec_NN(cp_list);
5910                     break;
5911                   }
5912
5913                 case NPOSIXL:
5914                     invert = 1;
5915                     /* FALLTHROUGH */
5916
5917                 case POSIXL:
5918                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5919                     if (flags & SCF_DO_STCLASS_AND) {
5920                         bool was_there = cBOOL(
5921                                           ANYOF_POSIXL_TEST(data->start_class,
5922                                                                  namedclass));
5923                         ANYOF_POSIXL_ZERO(data->start_class);
5924                         if (was_there) {    /* Do an AND */
5925                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5926                         }
5927                         /* No individual code points can now match */
5928                         data->start_class->invlist
5929                                                 = sv_2mortal(_new_invlist(0));
5930                     }
5931                     else {
5932                         int complement = namedclass + ((invert) ? -1 : 1);
5933
5934                         assert(flags & SCF_DO_STCLASS_OR);
5935
5936                         /* If the complement of this class was already there,
5937                          * the result is that they match all code points,
5938                          * (\d + \D == everything).  Remove the classes from
5939                          * future consideration.  Locale is not relevant in
5940                          * this case */
5941                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5942                             ssc_match_all_cp(data->start_class);
5943                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5944                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5945                         }
5946                         else {  /* The usual case; just add this class to the
5947                                    existing set */
5948                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5949                         }
5950                     }
5951                     break;
5952
5953                 case NPOSIXA:   /* For these, we always know the exact set of
5954                                    what's matched */
5955                     invert = 1;
5956                     /* FALLTHROUGH */
5957                 case POSIXA:
5958                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5959                     goto join_posix_and_ascii;
5960
5961                 case NPOSIXD:
5962                 case NPOSIXU:
5963                     invert = 1;
5964                     /* FALLTHROUGH */
5965                 case POSIXD:
5966                 case POSIXU:
5967                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5968
5969                     /* NPOSIXD matches all upper Latin1 code points unless the
5970                      * target string being matched is UTF-8, which is
5971                      * unknowable until match time.  Since we are going to
5972                      * invert, we want to get rid of all of them so that the
5973                      * inversion will match all */
5974                     if (OP(scan) == NPOSIXD) {
5975                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5976                                           &my_invlist);
5977                     }
5978
5979                   join_posix_and_ascii:
5980
5981                     if (flags & SCF_DO_STCLASS_AND) {
5982                         ssc_intersection(data->start_class, my_invlist, invert);
5983                         ssc_clear_locale(data->start_class);
5984                     }
5985                     else {
5986                         assert(flags & SCF_DO_STCLASS_OR);
5987                         ssc_union(data->start_class, my_invlist, invert);
5988                     }
5989                     SvREFCNT_dec(my_invlist);
5990                 }
5991                 if (flags & SCF_DO_STCLASS_OR)
5992                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5993                 flags &= ~SCF_DO_STCLASS;
5994             }
5995         }
5996         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5997             data->flags |= (OP(scan) == MEOL
5998                             ? SF_BEFORE_MEOL
5999                             : SF_BEFORE_SEOL);
6000             scan_commit(pRExC_state, data, minlenp, is_inf);
6001
6002         }
6003         else if (  PL_regkind[OP(scan)] == BRANCHJ
6004                  /* Lookbehind, or need to calculate parens/evals/stclass: */
6005                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
6006                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6007         {
6008             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6009                 || OP(scan) == UNLESSM )
6010             {
6011                 /* Negative Lookahead/lookbehind
6012                    In this case we can't do fixed string optimisation.
6013                 */
6014
6015                 SSize_t deltanext, minnext, fake = 0;
6016                 regnode *nscan;
6017                 regnode_ssc intrnl;
6018                 int f = 0;
6019
6020                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6021                 if (data) {
6022                     data_fake.whilem_c = data->whilem_c;
6023                     data_fake.last_closep = data->last_closep;
6024                 }
6025                 else
6026                     data_fake.last_closep = &fake;
6027                 data_fake.pos_delta = delta;
6028                 if ( flags & SCF_DO_STCLASS && !scan->flags
6029                      && OP(scan) == IFMATCH ) { /* Lookahead */
6030                     ssc_init(pRExC_state, &intrnl);
6031                     data_fake.start_class = &intrnl;
6032                     f |= SCF_DO_STCLASS_AND;
6033                 }
6034                 if (flags & SCF_WHILEM_VISITED_POS)
6035                     f |= SCF_WHILEM_VISITED_POS;
6036                 next = regnext(scan);
6037                 nscan = NEXTOPER(NEXTOPER(scan));
6038
6039                 /* recurse study_chunk() for lookahead body */
6040                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6041                                       last, &data_fake, stopparen,
6042                                       recursed_depth, NULL, f, depth+1);
6043                 if (scan->flags) {
6044                     if (   deltanext < 0
6045                         || deltanext > (I32) U8_MAX
6046                         || minnext > (I32)U8_MAX
6047                         || minnext + deltanext > (I32)U8_MAX)
6048                     {
6049                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6050                               (UV)U8_MAX);
6051                     }
6052
6053                     /* The 'next_off' field has been repurposed to count the
6054                      * additional starting positions to try beyond the initial
6055                      * one.  (This leaves it at 0 for non-variable length
6056                      * matches to avoid breakage for those not using this
6057                      * extension) */
6058                     if (deltanext) {
6059                         scan->next_off = deltanext;
6060                         ckWARNexperimental(RExC_parse,
6061                             WARN_EXPERIMENTAL__VLB,
6062                             "Variable length lookbehind is experimental");
6063                     }
6064                     scan->flags = (U8)minnext + deltanext;
6065                 }
6066                 if (data) {
6067                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6068                         pars++;
6069                     if (data_fake.flags & SF_HAS_EVAL)
6070                         data->flags |= SF_HAS_EVAL;
6071                     data->whilem_c = data_fake.whilem_c;
6072                 }
6073                 if (f & SCF_DO_STCLASS_AND) {
6074                     if (flags & SCF_DO_STCLASS_OR) {
6075                         /* OR before, AND after: ideally we would recurse with
6076                          * data_fake to get the AND applied by study of the
6077                          * remainder of the pattern, and then derecurse;
6078                          * *** HACK *** for now just treat as "no information".
6079                          * See [perl #56690].
6080                          */
6081                         ssc_init(pRExC_state, data->start_class);
6082                     }  else {
6083                         /* AND before and after: combine and continue.  These
6084                          * assertions are zero-length, so can match an EMPTY
6085                          * string */
6086                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6087                         ANYOF_FLAGS(data->start_class)
6088                                                    |= SSC_MATCHES_EMPTY_STRING;
6089                     }
6090                 }
6091             }
6092 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6093             else {
6094                 /* Positive Lookahead/lookbehind
6095                    In this case we can do fixed string optimisation,
6096                    but we must be careful about it. Note in the case of
6097                    lookbehind the positions will be offset by the minimum
6098                    length of the pattern, something we won't know about
6099                    until after the recurse.
6100                 */
6101                 SSize_t deltanext, fake = 0;
6102                 regnode *nscan;
6103                 regnode_ssc intrnl;
6104                 int f = 0;
6105                 /* We use SAVEFREEPV so that when the full compile
6106                     is finished perl will clean up the allocated
6107                     minlens when it's all done. This way we don't
6108                     have to worry about freeing them when we know
6109                     they wont be used, which would be a pain.
6110                  */
6111                 SSize_t *minnextp;
6112                 Newx( minnextp, 1, SSize_t );
6113                 SAVEFREEPV(minnextp);
6114
6115                 if (data) {
6116                     StructCopy(data, &data_fake, scan_data_t);
6117                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6118                         f |= SCF_DO_SUBSTR;
6119                         if (scan->flags)
6120                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6121                         data_fake.last_found=newSVsv(data->last_found);
6122                     }
6123                 }
6124                 else
6125                     data_fake.last_closep = &fake;
6126                 data_fake.flags = 0;
6127                 data_fake.substrs[0].flags = 0;
6128                 data_fake.substrs[1].flags = 0;
6129                 data_fake.pos_delta = delta;
6130                 if (is_inf)
6131                     data_fake.flags |= SF_IS_INF;
6132                 if ( flags & SCF_DO_STCLASS && !scan->flags
6133                      && OP(scan) == IFMATCH ) { /* Lookahead */
6134                     ssc_init(pRExC_state, &intrnl);
6135                     data_fake.start_class = &intrnl;
6136                     f |= SCF_DO_STCLASS_AND;
6137                 }
6138                 if (flags & SCF_WHILEM_VISITED_POS)
6139                     f |= SCF_WHILEM_VISITED_POS;
6140                 next = regnext(scan);
6141                 nscan = NEXTOPER(NEXTOPER(scan));
6142
6143                 /* positive lookahead study_chunk() recursion */
6144                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6145                                         &deltanext, last, &data_fake,
6146                                         stopparen, recursed_depth, NULL,
6147                                         f, depth+1);
6148                 if (scan->flags) {
6149                     assert(0);  /* This code has never been tested since this
6150                                    is normally not compiled */
6151                     if (   deltanext < 0
6152                         || deltanext > (I32) U8_MAX
6153                         || *minnextp > (I32)U8_MAX
6154                         || *minnextp + deltanext > (I32)U8_MAX)
6155                     {
6156                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6157                               (UV)U8_MAX);
6158                     }
6159
6160                     if (deltanext) {
6161                         scan->next_off = deltanext;
6162                     }
6163                     scan->flags = (U8)*minnextp + deltanext;
6164                 }
6165
6166                 *minnextp += min;
6167
6168                 if (f & SCF_DO_STCLASS_AND) {
6169                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6170                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6171                 }
6172                 if (data) {
6173                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6174                         pars++;
6175                     if (data_fake.flags & SF_HAS_EVAL)
6176                         data->flags |= SF_HAS_EVAL;
6177                     data->whilem_c = data_fake.whilem_c;
6178                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6179                         int i;
6180                         if (RExC_rx->minlen<*minnextp)
6181                             RExC_rx->minlen=*minnextp;
6182                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6183                         SvREFCNT_dec_NN(data_fake.last_found);
6184
6185                         for (i = 0; i < 2; i++) {
6186                             if (data_fake.substrs[i].minlenp != minlenp) {
6187                                 data->substrs[i].min_offset =
6188                                             data_fake.substrs[i].min_offset;
6189                                 data->substrs[i].max_offset =
6190                                             data_fake.substrs[i].max_offset;
6191                                 data->substrs[i].minlenp =
6192                                             data_fake.substrs[i].minlenp;
6193                                 data->substrs[i].lookbehind += scan->flags;
6194                             }
6195                         }
6196                     }
6197                 }
6198             }
6199 #endif
6200         }
6201         else if (OP(scan) == OPEN) {
6202             if (stopparen != (I32)ARG(scan))
6203                 pars++;
6204         }
6205         else if (OP(scan) == CLOSE) {
6206             if (stopparen == (I32)ARG(scan)) {
6207                 break;
6208             }
6209             if ((I32)ARG(scan) == is_par) {
6210                 next = regnext(scan);
6211
6212                 if ( next && (OP(next) != WHILEM) && next < last)
6213                     is_par = 0;         /* Disable optimization */
6214             }
6215             if (data)
6216                 *(data->last_closep) = ARG(scan);
6217         }
6218         else if (OP(scan) == EVAL) {
6219                 if (data)
6220                     data->flags |= SF_HAS_EVAL;
6221         }
6222         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6223             if (flags & SCF_DO_SUBSTR) {
6224                 scan_commit(pRExC_state, data, minlenp, is_inf);
6225                 flags &= ~SCF_DO_SUBSTR;
6226             }
6227             if (data && OP(scan)==ACCEPT) {
6228                 data->flags |= SCF_SEEN_ACCEPT;
6229                 if (stopmin > min)
6230                     stopmin = min;
6231             }
6232         }
6233         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6234         {
6235                 if (flags & SCF_DO_SUBSTR) {
6236                     scan_commit(pRExC_state, data, minlenp, is_inf);
6237                     data->cur_is_floating = 1; /* float */
6238                 }
6239                 is_inf = is_inf_internal = 1;
6240                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6241                     ssc_anything(data->start_class);
6242                 flags &= ~SCF_DO_STCLASS;
6243         }
6244         else if (OP(scan) == GPOS) {
6245             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6246                 !(delta || is_inf || (data && data->pos_delta)))
6247             {
6248                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6249                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6250                 if (RExC_rx->gofs < (STRLEN)min)
6251                     RExC_rx->gofs = min;
6252             } else {
6253                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6254                 RExC_rx->gofs = 0;
6255             }
6256         }
6257 #ifdef TRIE_STUDY_OPT
6258 #ifdef FULL_TRIE_STUDY
6259         else if (PL_regkind[OP(scan)] == TRIE) {
6260             /* NOTE - There is similar code to this block above for handling
6261                BRANCH nodes on the initial study.  If you change stuff here
6262                check there too. */
6263             regnode *trie_node= scan;
6264             regnode *tail= regnext(scan);
6265             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6266             SSize_t max1 = 0, min1 = SSize_t_MAX;
6267             regnode_ssc accum;
6268
6269             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6270                 /* Cannot merge strings after this. */
6271                 scan_commit(pRExC_state, data, minlenp, is_inf);
6272             }
6273             if (flags & SCF_DO_STCLASS)
6274                 ssc_init_zero(pRExC_state, &accum);
6275
6276             if (!trie->jump) {
6277                 min1= trie->minlen;
6278                 max1= trie->maxlen;
6279             } else {
6280                 const regnode *nextbranch= NULL;
6281                 U32 word;
6282
6283                 for ( word=1 ; word <= trie->wordcount ; word++)
6284                 {
6285                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6286                     regnode_ssc this_class;
6287
6288                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6289                     if (data) {
6290                         data_fake.whilem_c = data->whilem_c;
6291                         data_fake.last_closep = data->last_closep;
6292                     }
6293                     else
6294                         data_fake.last_closep = &fake;
6295                     data_fake.pos_delta = delta;
6296                     if (flags & SCF_DO_STCLASS) {
6297                         ssc_init(pRExC_state, &this_class);
6298                         data_fake.start_class = &this_class;
6299                         f = SCF_DO_STCLASS_AND;
6300                     }
6301                     if (flags & SCF_WHILEM_VISITED_POS)
6302                         f |= SCF_WHILEM_VISITED_POS;
6303
6304                     if (trie->jump[word]) {
6305                         if (!nextbranch)
6306                             nextbranch = trie_node + trie->jump[0];
6307                         scan= trie_node + trie->jump[word];
6308                         /* We go from the jump point to the branch that follows
6309                            it. Note this means we need the vestigal unused
6310                            branches even though they arent otherwise used. */
6311                         /* optimise study_chunk() for TRIE */
6312                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6313                             &deltanext, (regnode *)nextbranch, &data_fake,
6314                             stopparen, recursed_depth, NULL, f, depth+1);
6315                     }
6316                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6317                         nextbranch= regnext((regnode*)nextbranch);
6318
6319                     if (min1 > (SSize_t)(minnext + trie->minlen))
6320                         min1 = minnext + trie->minlen;
6321                     if (deltanext == SSize_t_MAX) {
6322                         is_inf = is_inf_internal = 1;
6323                         max1 = SSize_t_MAX;
6324                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6325                         max1 = minnext + deltanext + trie->maxlen;
6326
6327                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6328                         pars++;
6329                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6330                         if ( stopmin > min + min1)
6331                             stopmin = min + min1;
6332                         flags &= ~SCF_DO_SUBSTR;
6333                         if (data)
6334                             data->flags |= SCF_SEEN_ACCEPT;
6335                     }
6336                     if (data) {
6337                         if (data_fake.flags & SF_HAS_EVAL)
6338                             data->flags |= SF_HAS_EVAL;
6339                         data->whilem_c = data_fake.whilem_c;
6340                     }
6341                     if (flags & SCF_DO_STCLASS)
6342                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6343                 }
6344             }
6345             if (flags & SCF_DO_SUBSTR) {
6346                 data->pos_min += min1;
6347                 data->pos_delta += max1 - min1;
6348                 if (max1 != min1 || is_inf)
6349                     data->cur_is_floating = 1; /* float */
6350             }
6351             min += min1;
6352             if (delta != SSize_t_MAX) {
6353                 if (SSize_t_MAX - (max1 - min1) >= delta)
6354                     delta += max1 - min1;
6355                 else
6356                     delta = SSize_t_MAX;
6357             }
6358             if (flags & SCF_DO_STCLASS_OR) {
6359                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6360                 if (min1) {
6361                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6362                     flags &= ~SCF_DO_STCLASS;
6363                 }
6364             }
6365             else if (flags & SCF_DO_STCLASS_AND) {
6366                 if (min1) {
6367                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6368                     flags &= ~SCF_DO_STCLASS;
6369                 }
6370                 else {
6371                     /* Switch to OR mode: cache the old value of
6372                      * data->start_class */
6373                     INIT_AND_WITHP;
6374                     StructCopy(data->start_class, and_withp, regnode_ssc);
6375                     flags &= ~SCF_DO_STCLASS_AND;
6376                     StructCopy(&accum, data->start_class, regnode_ssc);
6377                     flags |= SCF_DO_STCLASS_OR;
6378                 }
6379             }
6380             scan= tail;
6381             continue;
6382         }
6383 #else
6384         else if (PL_regkind[OP(scan)] == TRIE) {
6385             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6386             U8*bang=NULL;
6387
6388             min += trie->minlen;
6389             delta += (trie->maxlen - trie->minlen);
6390             flags &= ~SCF_DO_STCLASS; /* xxx */
6391             if (flags & SCF_DO_SUBSTR) {
6392                 /* Cannot expect anything... */
6393                 scan_commit(pRExC_state, data, minlenp, is_inf);
6394                 data->pos_min += trie->minlen;
6395                 data->pos_delta += (trie->maxlen - trie->minlen);
6396                 if (trie->maxlen != trie->minlen)
6397                     data->cur_is_floating = 1; /* float */
6398             }
6399             if (trie->jump) /* no more substrings -- for now /grr*/
6400                flags &= ~SCF_DO_SUBSTR;
6401         }
6402 #endif /* old or new */
6403 #endif /* TRIE_STUDY_OPT */
6404
6405         /* Else: zero-length, ignore. */
6406         scan = regnext(scan);
6407     }
6408
6409   finish:
6410     if (frame) {
6411         /* we need to unwind recursion. */
6412         depth = depth - 1;
6413
6414         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6415         DEBUG_PEEP("fend", scan, depth, flags);
6416
6417         /* restore previous context */
6418         last = frame->last_regnode;
6419         scan = frame->next_regnode;
6420         stopparen = frame->stopparen;
6421         recursed_depth = frame->prev_recursed_depth;
6422
6423         RExC_frame_last = frame->prev_frame;
6424         frame = frame->this_prev_frame;
6425         goto fake_study_recurse;
6426     }
6427
6428     assert(!frame);
6429     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6430
6431     *scanp = scan;
6432     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6433
6434     if (flags & SCF_DO_SUBSTR && is_inf)
6435         data->pos_delta = SSize_t_MAX - data->pos_min;
6436     if (is_par > (I32)U8_MAX)
6437         is_par = 0;
6438     if (is_par && pars==1 && data) {
6439         data->flags |= SF_IN_PAR;
6440         data->flags &= ~SF_HAS_PAR;
6441     }
6442     else if (pars && data) {
6443         data->flags |= SF_HAS_PAR;
6444         data->flags &= ~SF_IN_PAR;
6445     }
6446     if (flags & SCF_DO_STCLASS_OR)
6447         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6448     if (flags & SCF_TRIE_RESTUDY)
6449         data->flags |=  SCF_TRIE_RESTUDY;
6450
6451     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6452
6453     {
6454         SSize_t final_minlen= min < stopmin ? min : stopmin;
6455
6456         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6457             if (final_minlen > SSize_t_MAX - delta)
6458                 RExC_maxlen = SSize_t_MAX;
6459             else if (RExC_maxlen < final_minlen + delta)
6460                 RExC_maxlen = final_minlen + delta;
6461         }
6462         return final_minlen;
6463     }
6464     NOT_REACHED; /* NOTREACHED */
6465 }
6466
6467 STATIC U32
6468 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6469 {
6470     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6471
6472     PERL_ARGS_ASSERT_ADD_DATA;
6473
6474     Renewc(RExC_rxi->data,
6475            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6476            char, struct reg_data);
6477     if(count)
6478         Renew(RExC_rxi->data->what, count + n, U8);
6479     else
6480         Newx(RExC_rxi->data->what, n, U8);
6481     RExC_rxi->data->count = count + n;
6482     Copy(s, RExC_rxi->data->what + count, n, U8);
6483     return count;
6484 }
6485
6486 /*XXX: todo make this not included in a non debugging perl, but appears to be
6487  * used anyway there, in 'use re' */
6488 #ifndef PERL_IN_XSUB_RE
6489 void
6490 Perl_reginitcolors(pTHX)
6491 {
6492     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6493     if (s) {
6494         char *t = savepv(s);
6495         int i = 0;
6496         PL_colors[0] = t;
6497         while (++i < 6) {
6498             t = strchr(t, '\t');
6499             if (t) {
6500                 *t = '\0';
6501                 PL_colors[i] = ++t;
6502             }
6503             else
6504                 PL_colors[i] = t = (char *)"";
6505         }
6506     } else {
6507         int i = 0;
6508         while (i < 6)
6509             PL_colors[i++] = (char *)"";
6510     }
6511     PL_colorset = 1;
6512 }
6513 #endif
6514
6515
/* CHECK_RESTUDY_GOTO_butfirst(dOsomething)
 *
 * With TRIE_STUDY_OPT: if the SCF_TRIE_RESTUDY bit is set in data.flags and
 * this is the first time the check fires (restudied is still 0; it is
 * incremented as part of the test), execute 'dOsomething' and then jump to
 * the 'reStudy' label.  Otherwise do nothing.
 *
 * The expansion site must therefore provide a 'data' object with a 'flags'
 * member, a modifiable 'restudied' counter and a 'reStudy' label.
 *
 * Without TRIE_STUDY_OPT the macro expands to a no-op.  The fallback has to
 * be function-like too: an object-like definition would leave the
 * parenthesised argument behind at the call site, where it would be
 * evaluated unconditionally instead of being discarded. */
#ifdef TRIE_STUDY_OPT
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
    STMT_START {                                            \
        if (                                                \
              (data.flags & SCF_TRIE_RESTUDY)               \
              && ! restudied++                              \
        ) {                                                 \
            dOsomething;                                    \
            goto reStudy;                                   \
        }                                                   \
    } STMT_END
#else
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) ((void)0)
#endif
6530
6531 /*
6532  * pregcomp - compile a regular expression into internal code
6533  *
6534  * Decides which engine's compiler to call based on the hint currently in
6535  * scope
6536  */
6537
6538 #ifndef PERL_IN_XSUB_RE
6539
6540 /* return the currently in-scope regex engine (or the default if none)  */
6541
6542 regexp_engine const *
6543 Perl_current_re_engine(pTHX)
6544 {
6545     if (IN_PERL_COMPILETIME) {
6546         HV * const table = GvHV(PL_hintgv);
6547         SV **ptr;
6548
6549         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6550             return &PL_core_reg_engine;
6551         ptr = hv_fetchs(table, "regcomp", FALSE);
6552         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6553             return &PL_core_reg_engine;
6554         return INT2PTR(regexp_engine*, SvIV(*ptr));
6555     }
6556     else {
6557         SV *ptr;
6558         if (!PL_curcop->cop_hints_hash)
6559             return &PL_core_reg_engine;
6560         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6561         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6562             return &PL_core_reg_engine;
6563         return INT2PTR(regexp_engine*, SvIV(ptr));
6564     }
6565 }
6566
6567
6568 REGEXP *
6569 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6570 {
6571     regexp_engine const *eng = current_re_engine();
6572     GET_RE_DEBUG_FLAGS_DECL;
6573
6574     PERL_ARGS_ASSERT_PREGCOMP;
6575
6576     /* Dispatch a request to compile a regexp to correct regexp engine. */
6577     DEBUG_COMPILE_r({
6578         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6579                         PTR2UV(eng));
6580     });
6581     return CALLREGCOMP_ENG(eng, pattern, flags);
6582 }
6583 #endif
6584
6585 /* public(ish) entry point for the perl core's own regex compiling code.
6586  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6587  * pattern rather than a list of OPs, and uses the internal engine rather
6588  * than the current one */
6589
6590 REGEXP *
6591 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6592 {
6593     SV *pat = pattern; /* defeat constness! */
6594     PERL_ARGS_ASSERT_RE_COMPILE;
6595     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6596 #ifdef PERL_IN_XSUB_RE
6597                                 &my_reg_engine,
6598 #else
6599                                 &PL_core_reg_engine,
6600 #endif
6601                                 NULL, NULL, rx_flags, 0);
6602 }
6603
6604
6605 static void
6606 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6607 {
6608     int n;
6609
6610     if (--cbs->refcnt > 0)
6611         return;
6612     for (n = 0; n < cbs->count; n++) {
6613         REGEXP *rx = cbs->cb[n].src_regex;
6614         if (rx) {
6615             cbs->cb[n].src_regex = NULL;
6616             SvREFCNT_dec_NN(rx);
6617         }
6618     }
6619     Safefree(cbs->cb);
6620     Safefree(cbs);
6621 }
6622
6623
6624 static struct reg_code_blocks *
6625 S_alloc_code_blocks(pTHX_  int ncode)
6626 {
6627      struct reg_code_blocks *cbs;
6628     Newx(cbs, 1, struct reg_code_blocks);
6629     cbs->count = ncode;
6630     cbs->refcnt = 1;
6631     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6632     if (ncode)
6633         Newx(cbs->cb, ncode, struct reg_code_block);
6634     else
6635         cbs->cb = NULL;
6636     return cbs;
6637 }
6638
6639
6640 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6641  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6642  * point to the realloced string and length.
6643  *
6644  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6645  * stuff added */
6646
6647 static void
6648 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6649                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6650 {
6651     U8 *const src = (U8*)*pat_p;
6652     U8 *dst, *d;
6653     int n=0;
6654     STRLEN s = 0;
6655     bool do_end = 0;
6656     GET_RE_DEBUG_FLAGS_DECL;
6657
6658     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6659         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6660
6661     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6662     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6663     d = dst;
6664
6665     while (s < *plen_p) {
6666         append_utf8_from_native_byte(src[s], &d);
6667
6668         if (n < num_code_blocks) {
6669             assert(pRExC_state->code_blocks);
6670             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6671                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6672                 assert(*(d - 1) == '(');
6673                 do_end = 1;
6674             }
6675             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6676                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6677                 assert(*(d - 1) == ')');
6678                 do_end = 0;
6679                 n++;
6680             }
6681         }
6682         s++;
6683     }
6684     *d = '\0';
6685     *plen_p = d - dst;
6686     *pat_p = (char*) dst;
6687     SAVEFREEPV(*pat_p);
6688     RExC_orig_utf8 = RExC_utf8 = 1;
6689 }
6690
6691
6692
6693 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6694  * while recording any code block indices, and handling overloading,
6695  * nested qr// objects etc.  If pat is null, it will allocate a new
6696  * string, or just return the first arg, if there's only one.
6697  *
6698  * Returns the malloced/updated pat.
6699  * patternp and pat_count is the array of SVs to be concatted;
6700  * oplist is the optional list of ops that generated the SVs;
6701  * recompile_p is a pointer to a boolean that will be set if
6702  *   the regex will need to be recompiled.
6703  * delim, if non-null is an SV that will be inserted between each element
6704  */
6705
6706 static SV*
6707 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6708                 SV *pat, SV ** const patternp, int pat_count,
6709                 OP *oplist, bool *recompile_p, SV *delim)
6710 {
6711     SV **svp;
6712     int n = 0;
6713     bool use_delim = FALSE;
6714     bool alloced = FALSE;
6715
6716     /* if we know we have at least two args, create an empty string,
6717      * then concatenate args to that. For no args, return an empty string */
6718     if (!pat && pat_count != 1) {
6719         pat = newSVpvs("");
6720         SAVEFREESV(pat);
6721         alloced = TRUE;
6722     }
6723
6724     for (svp = patternp; svp < patternp + pat_count; svp++) {
6725         SV *sv;
6726         SV *rx  = NULL;
6727         STRLEN orig_patlen = 0;
6728         bool code = 0;
6729         SV *msv = use_delim ? delim : *svp;
6730         if (!msv) msv = &PL_sv_undef;
6731
6732         /* if we've got a delimiter, we go round the loop twice for each
6733          * svp slot (except the last), using the delimiter the second
6734          * time round */
6735         if (use_delim) {
6736             svp--;
6737             use_delim = FALSE;
6738         }
6739         else if (delim)
6740             use_delim = TRUE;
6741
6742         if (SvTYPE(msv) == SVt_PVAV) {
6743             /* we've encountered an interpolated array within
6744              * the pattern, e.g. /...@a..../. Expand the list of elements,
6745              * then recursively append elements.
6746              * The code in this block is based on S_pushav() */
6747
6748             AV *const av = (AV*)msv;
6749             const SSize_t maxarg = AvFILL(av) + 1;
6750             SV **array;
6751
6752             if (oplist) {
6753                 assert(oplist->op_type == OP_PADAV
6754                     || oplist->op_type == OP_RV2AV);
6755                 oplist = OpSIBLING(oplist);
6756             }
6757
6758             if (SvRMAGICAL(av)) {
6759                 SSize_t i;
6760
6761                 Newx(array, maxarg, SV*);
6762                 SAVEFREEPV(array);
6763                 for (i=0; i < maxarg; i++) {
6764                     SV ** const svp = av_fetch(av, i, FALSE);
6765                     array[i] = svp ? *svp : &PL_sv_undef;
6766                 }
6767             }
6768             else
6769                 array = AvARRAY(av);
6770
6771             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6772                                 array, maxarg, NULL, recompile_p,
6773                                 /* $" */
6774                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6775
6776             continue;
6777         }
6778
6779
6780         /* we make the assumption here that each op in the list of
6781          * op_siblings maps to one SV pushed onto the stack,
6782          * except for code blocks, with have both an OP_NULL and
6783          * and OP_CONST.
6784          * This allows us to match up the list of SVs against the
6785          * list of OPs to find the next code block.
6786          *
6787          * Note that       PUSHMARK PADSV PADSV ..
6788          * is optimised to
6789          *                 PADRANGE PADSV  PADSV  ..
6790          * so the alignment still works. */
6791
6792         if (oplist) {
6793             if (oplist->op_type == OP_NULL
6794                 && (oplist->op_flags & OPf_SPECIAL))
6795             {
6796                 assert(n < pRExC_state->code_blocks->count);
6797                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6798                 pRExC_state->code_blocks->cb[n].block = oplist;
6799                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6800                 n++;
6801                 code = 1;
6802                 oplist = OpSIBLING(oplist); /* skip CONST */
6803                 assert(oplist);
6804             }
6805             oplist = OpSIBLING(oplist);;
6806         }
6807
6808         /* apply magic and QR overloading to arg */
6809
6810         SvGETMAGIC(msv);
6811         if (SvROK(msv) && SvAMAGIC(msv)) {
6812             SV *sv = AMG_CALLunary(msv, regexp_amg);
6813             if (sv) {
6814                 if (SvROK(sv))
6815                     sv = SvRV(sv);
6816                 if (SvTYPE(sv) != SVt_REGEXP)
6817                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6818                 msv = sv;
6819             }
6820         }
6821
6822         /* try concatenation overload ... */
6823         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6824                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6825         {
6826             sv_setsv(pat, sv);
6827             /* overloading involved: all bets are off over literal
6828              * code. Pretend we haven't seen it */
6829             if (n)
6830                 pRExC_state->code_blocks->count -= n;
6831             n = 0;
6832         }
6833         else  {
6834             /* ... or failing that, try "" overload */
6835             while (SvAMAGIC(msv)
6836                     && (sv = AMG_CALLunary(msv, string_amg))
6837                     && sv != msv
6838                     &&  !(   SvROK(msv)
6839                           && SvROK(sv)
6840                           && SvRV(msv) == SvRV(sv))
6841             ) {
6842                 msv = sv;
6843                 SvGETMAGIC(msv);
6844             }
6845             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6846                 msv = SvRV(msv);
6847
6848             if (pat) {
6849                 /* this is a partially unrolled
6850                  *     sv_catsv_nomg(pat, msv);
6851                  * that allows us to adjust code block indices if
6852                  * needed */
6853                 STRLEN dlen;
6854                 char *dst = SvPV_force_nomg(pat, dlen);
6855                 orig_patlen = dlen;
6856                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6857                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6858                     sv_setpvn(pat, dst, dlen);
6859                     SvUTF8_on(pat);
6860                 }
6861                 sv_catsv_nomg(pat, msv);
6862                 rx = msv;
6863             }
6864             else {
6865                 /* We have only one SV to process, but we need to verify
6866                  * it is properly null terminated or we will fail asserts
6867                  * later. In theory we probably shouldn't get such SV's,
6868                  * but if we do we should handle it gracefully. */
6869                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6870                     /* not a string, or a string with a trailing null */
6871                     pat = msv;
6872                 } else {
6873                     /* a string with no trailing null, we need to copy it
6874                      * so it has a trailing null */
6875                     pat = sv_2mortal(newSVsv(msv));
6876                 }
6877             }
6878
6879             if (code)
6880                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6881         }
6882
6883         /* extract any code blocks within any embedded qr//'s */
6884         if (rx && SvTYPE(rx) == SVt_REGEXP
6885             && RX_ENGINE((REGEXP*)rx)->op_comp)
6886         {
6887
6888             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6889             if (ri->code_blocks && ri->code_blocks->count) {
6890                 int i;
6891                 /* the presence of an embedded qr// with code means
6892                  * we should always recompile: the text of the
6893                  * qr// may not have changed, but it may be a
6894                  * different closure than last time */
6895                 *recompile_p = 1;
6896                 if (pRExC_state->code_blocks) {
6897                     int new_count = pRExC_state->code_blocks->count
6898                             + ri->code_blocks->count;
6899                     Renew(pRExC_state->code_blocks->cb,
6900                             new_count, struct reg_code_block);
6901                     pRExC_state->code_blocks->count = new_count;
6902                 }
6903                 else
6904                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6905                                                     ri->code_blocks->count);
6906
6907                 for (i=0; i < ri->code_blocks->count; i++) {
6908                     struct reg_code_block *src, *dst;
6909                     STRLEN offset =  orig_patlen
6910                         + ReANY((REGEXP *)rx)->pre_prefix;
6911                     assert(n < pRExC_state->code_blocks->count);
6912                     src = &ri->code_blocks->cb[i];
6913                     dst = &pRExC_state->code_blocks->cb[n];
6914                     dst->start      = src->start + offset;
6915                     dst->end        = src->end   + offset;
6916                     dst->block      = src->block;
6917                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6918                                             src->src_regex
6919                                                 ? src->src_regex
6920                                                 : (REGEXP*)rx);
6921                     n++;
6922                 }
6923             }
6924         }
6925     }
6926     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6927     if (alloced)
6928         SvSETMAGIC(pat);
6929
6930     return pat;
6931 }
6932
6933
6934
6935 /* see if there are any run-time code blocks in the pattern.
6936  * False positives are allowed */
6937
6938 static bool
6939 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6940                     char *pat, STRLEN plen)
6941 {
6942     int n = 0;
6943     STRLEN s;
6944
6945     PERL_UNUSED_CONTEXT;
6946
6947     for (s = 0; s < plen; s++) {
6948         if (   pRExC_state->code_blocks
6949             && n < pRExC_state->code_blocks->count
6950             && s == pRExC_state->code_blocks->cb[n].start)
6951         {
6952             s = pRExC_state->code_blocks->cb[n].end;
6953             n++;
6954             continue;
6955         }
6956         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6957          * positives here */
6958         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6959             (pat[s+2] == '{'
6960                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6961         )
6962             return 1;
6963     }
6964     return 0;
6965 }
6966
6967 /* Handle run-time code blocks. We will already have compiled any direct
6968  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6969  * copy of it, but with any literal code blocks blanked out and
6970  * appropriate chars escaped; then feed it into
6971  *
6972  *    eval "qr'modified_pattern'"
6973  *
6974  * For example,
6975  *
6976  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6977  *
6978  * becomes
6979  *
6980  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6981  *
6982  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6983  * and merge them with any code blocks of the original regexp.
6984  *
6985  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6986  * instead, just save the qr and return FALSE; this tells our caller that
6987  * the original pattern needs upgrading to utf8.
6988  */
6989
6990 static bool
6991 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6992     char *pat, STRLEN plen)
6993 {
6994     SV *qr;
6995
6996     GET_RE_DEBUG_FLAGS_DECL;
6997
6998     if (pRExC_state->runtime_code_qr) {
6999         /* this is the second time we've been called; this should
7000          * only happen if the main pattern got upgraded to utf8
7001          * during compilation; re-use the qr we compiled first time
7002          * round (which should be utf8 too)
7003          */
7004         qr = pRExC_state->runtime_code_qr;
7005         pRExC_state->runtime_code_qr = NULL;
7006         assert(RExC_utf8 && SvUTF8(qr));
7007     }
7008     else {
7009         int n = 0;
7010         STRLEN s;
7011         char *p, *newpat;
7012         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7013         SV *sv, *qr_ref;
7014         dSP;
7015
7016         /* determine how many extra chars we need for ' and \ escaping */
7017         for (s = 0; s < plen; s++) {
7018             if (pat[s] == '\'' || pat[s] == '\\')
7019                 newlen++;
7020         }
7021
7022         Newx(newpat, newlen, char);
7023         p = newpat;
7024         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7025
7026         for (s = 0; s < plen; s++) {
7027             if (   pRExC_state->code_blocks
7028                 && n < pRExC_state->code_blocks->count
7029                 && s == pRExC_state->code_blocks->cb[n].start)
7030             {
7031                 /* blank out literal code block so that they aren't
7032                  * recompiled: eg change from/to:
7033                  *     /(?{xyz})/
7034                  *     /(?=====)/
7035                  * and
7036                  *     /(??{xyz})/
7037                  *     /(?======)/
7038                  * and
7039                  *     /(?(?{xyz}))/
7040                  *     /(?(?=====))/
7041                 */
7042                 assert(pat[s]   == '(');
7043                 assert(pat[s+1] == '?');
7044                 *p++ = '(';
7045                 *p++ = '?';
7046                 s += 2;
7047                 while (s < pRExC_state->code_blocks->cb[n].end) {
7048                     *p++ = '=';
7049                     s++;
7050                 }
7051                 *p++ = ')';
7052                 n++;
7053                 continue;
7054             }
7055             if (pat[s] == '\'' || pat[s] == '\\')
7056                 *p++ = '\\';
7057             *p++ = pat[s];
7058         }
7059         *p++ = '\'';
7060         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7061             *p++ = 'x';
7062             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7063                 *p++ = 'x';
7064             }
7065         }
7066         *p++ = '\0';
7067         DEBUG_COMPILE_r({
7068             Perl_re_printf( aTHX_
7069                 "%sre-parsing pattern for runtime code:%s %s\n",
7070                 PL_colors[4], PL_colors[5], newpat);
7071         });
7072
7073         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7074         Safefree(newpat);
7075
7076         ENTER;
7077         SAVETMPS;
7078         save_re_context();
7079         PUSHSTACKi(PERLSI_REQUIRE);
7080         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7081          * parsing qr''; normally only q'' does this. It also alters
7082          * hints handling */
7083         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7084         SvREFCNT_dec_NN(sv);
7085         SPAGAIN;
7086         qr_ref = POPs;
7087         PUTBACK;
7088         {
7089             SV * const errsv = ERRSV;
7090             if (SvTRUE_NN(errsv))
7091                 /* use croak_sv ? */
7092                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7093         }
7094         assert(SvROK(qr_ref));
7095         qr = SvRV(qr_ref);
7096         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7097         /* the leaving below frees the tmp qr_ref.
7098          * Give qr a life of its own */
7099         SvREFCNT_inc(qr);
7100         POPSTACK;
7101         FREETMPS;
7102         LEAVE;
7103
7104     }
7105
7106     if (!RExC_utf8 && SvUTF8(qr)) {
7107         /* first time through; the pattern got upgraded; save the
7108          * qr for the next time through */
7109         assert(!pRExC_state->runtime_code_qr);
7110         pRExC_state->runtime_code_qr = qr;
7111         return 0;
7112     }
7113
7114
7115     /* extract any code blocks within the returned qr//  */
7116
7117
7118     /* merge the main (r1) and run-time (r2) code blocks into one */
7119     {
7120         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7121         struct reg_code_block *new_block, *dst;
7122         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7123         int i1 = 0, i2 = 0;
7124         int r1c, r2c;
7125
7126         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7127         {
7128             SvREFCNT_dec_NN(qr);
7129             return 1;
7130         }
7131
7132         if (!r1->code_blocks)
7133             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7134
7135         r1c = r1->code_blocks->count;
7136         r2c = r2->code_blocks->count;
7137
7138         Newx(new_block, r1c + r2c, struct reg_code_block);
7139
7140         dst = new_block;
7141
7142         while (i1 < r1c || i2 < r2c) {
7143             struct reg_code_block *src;
7144             bool is_qr = 0;
7145
7146             if (i1 == r1c) {
7147                 src = &r2->code_blocks->cb[i2++];
7148                 is_qr = 1;
7149             }
7150             else if (i2 == r2c)
7151                 src = &r1->code_blocks->cb[i1++];
7152             else if (  r1->code_blocks->cb[i1].start
7153                      < r2->code_blocks->cb[i2].start)
7154             {
7155                 src = &r1->code_blocks->cb[i1++];
7156                 assert(src->end < r2->code_blocks->cb[i2].start);
7157             }
7158             else {
7159                 assert(  r1->code_blocks->cb[i1].start
7160                        > r2->code_blocks->cb[i2].start);
7161                 src = &r2->code_blocks->cb[i2++];
7162                 is_qr = 1;
7163                 assert(src->end < r1->code_blocks->cb[i1].start);
7164             }
7165
7166             assert(pat[src->start] == '(');
7167             assert(pat[src->end]   == ')');
7168             dst->start      = src->start;
7169             dst->end        = src->end;
7170             dst->block      = src->block;
7171             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7172                                     : src->src_regex;
7173             dst++;
7174         }
7175         r1->code_blocks->count += r2c;
7176         Safefree(r1->code_blocks->cb);
7177         r1->code_blocks->cb = new_block;
7178     }
7179
7180     SvREFCNT_dec_NN(qr);
7181     return 1;
7182 }
7183
7184
7185 STATIC bool
7186 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7187                       struct reg_substr_datum  *rsd,
7188                       struct scan_data_substrs *sub,
7189                       STRLEN longest_length)
7190 {
7191     /* This is the common code for setting up the floating and fixed length
7192      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7193      * as to whether succeeded or not */
7194
7195     I32 t;
7196     SSize_t ml;
7197     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7198     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7199
7200     if (! (longest_length
7201            || (eol /* Can't have SEOL and MULTI */
7202                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7203           )
7204             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7205         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7206     {
7207         return FALSE;
7208     }
7209
7210     /* copy the information about the longest from the reg_scan_data
7211         over to the program. */
7212     if (SvUTF8(sub->str)) {
7213         rsd->substr      = NULL;
7214         rsd->utf8_substr = sub->str;
7215     } else {
7216         rsd->substr      = sub->str;
7217         rsd->utf8_substr = NULL;
7218     }
7219     /* end_shift is how many chars that must be matched that
7220         follow this item. We calculate it ahead of time as once the
7221         lookbehind offset is added in we lose the ability to correctly
7222         calculate it.*/
7223     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7224     rsd->end_shift = ml - sub->min_offset
7225         - longest_length
7226             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7227              * intead? - DAPM
7228             + (SvTAIL(sub->str) != 0)
7229             */
7230         + sub->lookbehind;
7231
7232     t = (eol/* Can't have SEOL and MULTI */
7233          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7234     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7235
7236     return TRUE;
7237 }
7238
7239 STATIC void
7240 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7241 {
7242     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7243      * properly wrapped with the right modifiers */
7244
7245     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7246     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7247                                                 != REGEX_DEPENDS_CHARSET);
7248
7249     /* The caret is output if there are any defaults: if not all the STD
7250         * flags are set, or if no character set specifier is needed */
7251     bool has_default =
7252                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7253                 || ! has_charset);
7254     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7255                                                 == REG_RUN_ON_COMMENT_SEEN);
7256     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7257                         >> RXf_PMf_STD_PMMOD_SHIFT);
7258     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7259     char *p;
7260     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7261
7262     /* We output all the necessary flags; we never output a minus, as all
7263         * those are defaults, so are
7264         * covered by the caret */
7265     const STRLEN wraplen = pat_len + has_p + has_runon
7266         + has_default       /* If needs a caret */
7267         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7268
7269             /* If needs a character set specifier */
7270         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7271         + (sizeof("(?:)") - 1);
7272
7273     PERL_ARGS_ASSERT_SET_REGEX_PV;
7274
7275     /* make sure PL_bitcount bounds not exceeded */
7276     assert(sizeof(STD_PAT_MODS) <= 8);
7277
7278     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7279     SvPOK_on(Rx);
7280     if (RExC_utf8)
7281         SvFLAGS(Rx) |= SVf_UTF8;
7282     *p++='('; *p++='?';
7283
7284     /* If a default, cover it using the caret */
7285     if (has_default) {
7286         *p++= DEFAULT_PAT_MOD;
7287     }
7288     if (has_charset) {
7289         STRLEN len;
7290         const char* name;
7291
7292         name = get_regex_charset_name(RExC_rx->extflags, &len);
7293         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7294             assert(RExC_utf8);
7295             name = UNICODE_PAT_MODS;
7296             len = sizeof(UNICODE_PAT_MODS) - 1;
7297         }
7298         Copy(name, p, len, char);
7299         p += len;
7300     }
7301     if (has_p)
7302         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7303     {
7304         char ch;
7305         while((ch = *fptr++)) {
7306             if(reganch & 1)
7307                 *p++ = ch;
7308             reganch >>= 1;
7309         }
7310     }
7311
7312     *p++ = ':';
7313     Copy(RExC_precomp, p, pat_len, char);
7314     assert ((RX_WRAPPED(Rx) - p) < 16);
7315     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7316     p += pat_len;
7317
7318     /* Adding a trailing \n causes this to compile properly:
7319             my $R = qr / A B C # D E/x; /($R)/
7320         Otherwise the parens are considered part of the comment */
7321     if (has_runon)
7322         *p++ = '\n';
7323     *p++ = ')';
7324     *p = 0;
7325     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7326 }
7327
7328 /*
7329  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7330  * regular expression into internal code.
7331  * The pattern may be passed either as:
7332  *    a list of SVs (patternp plus pat_count)
7333  *    a list of OPs (expr)
7334  * If both are passed, the SV list is used, but the OP list indicates
7335  * which SVs are actually pre-compiled code blocks
7336  *
7337  * The SVs in the list have magic and qr overloading applied to them (and
7338  * the list may be modified in-place with replacement SVs in the latter
7339  * case).
7340  *
7341  * If the pattern hasn't changed from old_re, then old_re will be
7342  * returned.
7343  *
7344  * eng is the current engine. If that engine has an op_comp method, then
7345  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7346  * do the initial concatenation of arguments and pass on to the external
7347  * engine.
7348  *
7349  * If is_bare_re is not null, set it to a boolean indicating whether the
7350  * arg list reduced (after overloading) to a single bare regex which has
7351  * been returned (i.e. /$qr/).
7352  *
7353  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7354  *
7355  * pm_flags contains the PMf_* flags, typically based on those from the
7356  * pm_flags field of the related PMOP. Currently we're only interested in
7357  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7358  *
7359  * For many years this code had an initial sizing pass that calculated
7360  * (sometimes incorrectly, leading to security holes) the size needed for the
7361  * compiled pattern.  That was changed by commit
7362  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7363  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7364  * references to this sizing pass.
7365  *
7366  * Now, an initial crude guess as to the size needed is made, based on the
7367  * length of the pattern.  Patches welcome to improve that guess.  That amount
7368  * of space is malloc'd and then immediately freed, and then clawed back node
7369  * by node.  This design is to minimize, to the extent possible, memory churn
7370  * when doing the reallocs.
7371  *
7372  * A separate parentheses counting pass may be needed in some cases.
7373  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7374  * of these cases.
7375  *
7376  * The existence of a sizing pass necessitated design decisions that are no
7377  * longer needed.  There are potential areas of simplification.
7378  *
7379  * Beware that the optimization-preparation code in here knows about some
7380  * of the structure of the compiled regexp.  [I'll say.]
7381  */
7382
7383 REGEXP *
7384 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7385                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7386                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7387 {
7388     dVAR;
7389     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7390     STRLEN plen;
7391     char *exp;
7392     regnode *scan;
7393     I32 flags;
7394     SSize_t minlen = 0;
7395     U32 rx_flags;
7396     SV *pat;
7397     SV** new_patternp = patternp;
7398
7399     /* these are all flags - maybe they should be turned
7400      * into a single int with different bit masks */
7401     I32 sawlookahead = 0;
7402     I32 sawplus = 0;
7403     I32 sawopen = 0;
7404     I32 sawminmod = 0;
7405
7406     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7407     bool recompile = 0;
7408     bool runtime_code = 0;
7409     scan_data_t data;
7410     RExC_state_t RExC_state;
7411     RExC_state_t * const pRExC_state = &RExC_state;
7412 #ifdef TRIE_STUDY_OPT
7413     int restudied = 0;
7414     RExC_state_t copyRExC_state;
7415 #endif
7416     GET_RE_DEBUG_FLAGS_DECL;
7417
7418     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7419
7420     DEBUG_r(if (!PL_colorset) reginitcolors());
7421
7422
7423     pRExC_state->warn_text = NULL;
7424     pRExC_state->unlexed_names = NULL;
7425     pRExC_state->code_blocks = NULL;
7426
7427     if (is_bare_re)
7428         *is_bare_re = FALSE;
7429
7430     if (expr && (expr->op_type == OP_LIST ||
7431                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7432         /* allocate code_blocks if needed */
7433         OP *o;
7434         int ncode = 0;
7435
7436         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7437             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7438                 ncode++; /* count of DO blocks */
7439
7440         if (ncode)
7441             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7442     }
7443
7444     if (!pat_count) {
7445         /* compile-time pattern with just OP_CONSTs and DO blocks */
7446
7447         int n;
7448         OP *o;
7449
7450         /* find how many CONSTs there are */
7451         assert(expr);
7452         n = 0;
7453         if (expr->op_type == OP_CONST)
7454             n = 1;
7455         else
7456             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7457                 if (o->op_type == OP_CONST)
7458                     n++;
7459             }
7460
7461         /* fake up an SV array */
7462
7463         assert(!new_patternp);
7464         Newx(new_patternp, n, SV*);
7465         SAVEFREEPV(new_patternp);
7466         pat_count = n;
7467
7468         n = 0;
7469         if (expr->op_type == OP_CONST)
7470             new_patternp[n] = cSVOPx_sv(expr);
7471         else
7472             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7473                 if (o->op_type == OP_CONST)
7474                     new_patternp[n++] = cSVOPo_sv;
7475             }
7476
7477     }
7478
7479     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7480         "Assembling pattern from %d elements%s\n", pat_count,
7481             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7482
7483     /* set expr to the first arg op */
7484
7485     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7486          && expr->op_type != OP_CONST)
7487     {
7488             expr = cLISTOPx(expr)->op_first;
7489             assert(   expr->op_type == OP_PUSHMARK
7490                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7491                    || expr->op_type == OP_PADRANGE);
7492             expr = OpSIBLING(expr);
7493     }
7494
7495     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7496                         expr, &recompile, NULL);
7497
7498     /* handle bare (possibly after overloading) regex: foo =~ $re */
7499     {
7500         SV *re = pat;
7501         if (SvROK(re))
7502             re = SvRV(re);
7503         if (SvTYPE(re) == SVt_REGEXP) {
7504             if (is_bare_re)
7505                 *is_bare_re = TRUE;
7506             SvREFCNT_inc(re);
7507             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7508                 "Precompiled pattern%s\n",
7509                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7510
7511             return (REGEXP*)re;
7512         }
7513     }
7514
7515     exp = SvPV_nomg(pat, plen);
7516
7517     if (!eng->op_comp) {
7518         if ((SvUTF8(pat) && IN_BYTES)
7519                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7520         {
7521             /* make a temporary copy; either to convert to bytes,
7522              * or to avoid repeating get-magic / overloaded stringify */
7523             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7524                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7525         }
7526         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7527     }
7528
7529     /* ignore the utf8ness if the pattern is 0 length */
7530     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7531     RExC_uni_semantics = 0;
7532     RExC_contains_locale = 0;
7533     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7534     RExC_in_script_run = 0;
7535     RExC_study_started = 0;
7536     pRExC_state->runtime_code_qr = NULL;
7537     RExC_frame_head= NULL;
7538     RExC_frame_last= NULL;
7539     RExC_frame_count= 0;
7540     RExC_latest_warn_offset = 0;
7541     RExC_use_BRANCHJ = 0;
7542     RExC_total_parens = 0;
7543     RExC_open_parens = NULL;
7544     RExC_close_parens = NULL;
7545     RExC_paren_names = NULL;
7546     RExC_size = 0;
7547     RExC_seen_d_op = FALSE;
7548 #ifdef DEBUGGING
7549     RExC_paren_name_list = NULL;
7550 #endif
7551
7552     DEBUG_r({
7553         RExC_mysv1= sv_newmortal();
7554         RExC_mysv2= sv_newmortal();
7555     });
7556
7557     DEBUG_COMPILE_r({
7558             SV *dsv= sv_newmortal();
7559             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7560             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7561                           PL_colors[4], PL_colors[5], s);
7562         });
7563
7564     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7565      * to utf8 */
7566
7567     if ((pm_flags & PMf_USE_RE_EVAL)
7568                 /* this second condition covers the non-regex literal case,
7569                  * i.e.  $foo =~ '(?{})'. */
7570                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7571     )
7572         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7573
7574   redo_parse:
7575     /* return old regex if pattern hasn't changed */
7576     /* XXX: note in the below we have to check the flags as well as the
7577      * pattern.
7578      *
7579      * Things get a touch tricky as we have to compare the utf8 flag
7580      * independently from the compile flags.  */
7581
7582     if (   old_re
7583         && !recompile
7584         && !!RX_UTF8(old_re) == !!RExC_utf8
7585         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7586         && RX_PRECOMP(old_re)
7587         && RX_PRELEN(old_re) == plen
7588         && memEQ(RX_PRECOMP(old_re), exp, plen)
7589         && !runtime_code /* with runtime code, always recompile */ )
7590     {
7591         DEBUG_COMPILE_r({
7592             SV *dsv= sv_newmortal();
7593             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7594             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7595                           PL_colors[4], PL_colors[5], s);
7596         });
7597         return old_re;
7598     }
7599
7600     /* Allocate the pattern's SV */
7601     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7602     RExC_rx = ReANY(Rx);
7603     if ( RExC_rx == NULL )
7604         FAIL("Regexp out of space");
7605
7606     rx_flags = orig_rx_flags;
7607
7608     if (   (UTF || RExC_uni_semantics)
7609         && initial_charset == REGEX_DEPENDS_CHARSET)
7610     {
7611
7612         /* Set to use unicode semantics if the pattern is in utf8 and has the
7613          * 'depends' charset specified, as it means unicode when utf8  */
7614         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7615         RExC_uni_semantics = 1;
7616     }
7617
7618     RExC_pm_flags = pm_flags;
7619
7620     if (runtime_code) {
7621         assert(TAINTING_get || !TAINT_get);
7622         if (TAINT_get)
7623             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7624
7625         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7626             /* whoops, we have a non-utf8 pattern, whilst run-time code
7627              * got compiled as utf8. Try again with a utf8 pattern */
7628             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7629                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7630             goto redo_parse;
7631         }
7632     }
7633     assert(!pRExC_state->runtime_code_qr);
7634
7635     RExC_sawback = 0;
7636
7637     RExC_seen = 0;
7638     RExC_maxlen = 0;
7639     RExC_in_lookbehind = 0;
7640     RExC_in_lookahead = 0;
7641     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7642     RExC_recode_x_to_native = 0;
7643     RExC_in_multi_char_class = 0;
7644
7645     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7646     RExC_precomp_end = RExC_end = exp + plen;
7647     RExC_nestroot = 0;
7648     RExC_whilem_seen = 0;
7649     RExC_end_op = NULL;
7650     RExC_recurse = NULL;
7651     RExC_study_chunk_recursed = NULL;
7652     RExC_study_chunk_recursed_bytes= 0;
7653     RExC_recurse_count = 0;
7654     pRExC_state->code_index = 0;
7655
7656     /* Initialize the string in the compiled pattern.  This is so that there is
7657      * something to output if necessary */
7658     set_regex_pv(pRExC_state, Rx);
7659
7660     DEBUG_PARSE_r({
7661         Perl_re_printf( aTHX_
7662             "Starting parse and generation\n");
7663         RExC_lastnum=0;
7664         RExC_lastparse=NULL;
7665     });
7666
7667     /* Allocate space and zero-initialize. Note, the two step process
7668        of zeroing when in debug mode, thus anything assigned has to
7669        happen after that */
7670     if (!  RExC_size) {
7671
7672         /* On the first pass of the parse, we guess how big this will be.  Then
7673          * we grow in one operation to that amount and then give it back.  As
7674          * we go along, we re-allocate what we need.
7675          *
7676          * XXX Currently the guess is essentially that the pattern will be an
7677          * EXACT node with one byte input, one byte output.  This is crude, and
7678          * better heuristics are welcome.
7679          *
7680          * On any subsequent passes, we guess what we actually computed in the
7681          * latest earlier pass.  Such a pass probably didn't complete so is
7682          * missing stuff.  We could improve those guesses by knowing where the
7683          * parse stopped, and use the length so far plus apply the above
7684          * assumption to what's left. */
7685         RExC_size = STR_SZ(RExC_end - RExC_start);
7686     }
7687
7688     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7689     if ( RExC_rxi == NULL )
7690         FAIL("Regexp out of space");
7691
7692     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7693     RXi_SET( RExC_rx, RExC_rxi );
7694
7695     /* We start from 0 (or start over from 0 if this is a reparse).  The first
7696      * node parsed will give back any excess memory we have allocated so far.
7697      * */
7698     RExC_size = 0;
7699
7700     /* non-zero initialization begins here */
7701     RExC_rx->engine= eng;
7702     RExC_rx->extflags = rx_flags;
7703     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7704
7705     if (pm_flags & PMf_IS_QR) {
7706         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7707         if (RExC_rxi->code_blocks) {
7708             RExC_rxi->code_blocks->refcnt++;
7709         }
7710     }
7711
7712     RExC_rx->intflags = 0;
7713
7714     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7715     RExC_parse = exp;
7716
7717     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7718      * code makes sure the final byte is an uncounted NUL.  But should this
7719      * ever not be the case, lots of things could read beyond the end of the
7720      * buffer: loops like
7721      *      while(isFOO(*RExC_parse)) RExC_parse++;
7722      *      strchr(RExC_parse, "foo");
7723      * etc.  So it is worth noting. */
7724     assert(*RExC_end == '\0');
7725
7726     RExC_naughty = 0;
7727     RExC_npar = 1;
7728     RExC_parens_buf_size = 0;
7729     RExC_emit_start = RExC_rxi->program;
7730     pRExC_state->code_index = 0;
7731
7732     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7733     RExC_emit = 1;
7734
7735     /* Do the parse */
7736     if (reg(pRExC_state, 0, &flags, 1)) {
7737
7738         /* Success!, But we may need to redo the parse knowing how many parens
7739          * there actually are */
7740         if (IN_PARENS_PASS) {
7741             flags |= RESTART_PARSE;
7742         }
7743
7744         /* We have that number in RExC_npar */
7745         RExC_total_parens = RExC_npar;
7746     }
7747     else if (! MUST_RESTART(flags)) {
7748         ReREFCNT_dec(Rx);
7749         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7750     }
7751
7752     /* Here, we either have success, or we have to redo the parse for some reason */
7753     if (MUST_RESTART(flags)) {
7754
7755         /* It's possible to write a regexp in ascii that represents Unicode
7756         codepoints outside of the byte range, such as via \x{100}. If we
7757         detect such a sequence we have to convert the entire pattern to utf8
7758         and then recompile, as our sizing calculation will have been based
7759         on 1 byte == 1 character, but we will need to use utf8 to encode
7760         at least some part of the pattern, and therefore must convert the whole
7761         thing.
7762         -- dmq */
7763         if (flags & NEED_UTF8) {
7764
7765             /* We have stored the offset of the final warning output so far.
7766              * That must be adjusted.  Any variant characters between the start
7767              * of the pattern and this warning count for 2 bytes in the final,
7768              * so just add them again */
7769             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7770                 RExC_latest_warn_offset +=
7771                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7772                                                 + RExC_latest_warn_offset);
7773             }
7774             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7775             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7776             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7777         }
7778         else {
7779             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7780         }
7781
7782         if (ALL_PARENS_COUNTED) {
7783             /* Make enough room for all the known parens, and zero it */
7784             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7785             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7786             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7787
7788             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7789             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7790         }
7791         else { /* Parse did not complete.  Reinitialize the parentheses
7792                   structures */
7793             RExC_total_parens = 0;
7794             if (RExC_open_parens) {
7795                 Safefree(RExC_open_parens);
7796                 RExC_open_parens = NULL;
7797             }
7798             if (RExC_close_parens) {
7799                 Safefree(RExC_close_parens);
7800                 RExC_close_parens = NULL;
7801             }
7802         }
7803
7804         /* Clean up what we did in this parse */
7805         SvREFCNT_dec_NN(RExC_rx_sv);
7806
7807         goto redo_parse;
7808     }
7809
7810     /* Here, we have successfully parsed and generated the pattern's program
7811      * for the regex engine.  We are ready to finish things up and look for
7812      * optimizations. */
7813
7814     /* Update the string to compile, with correct modifiers, etc */
7815     set_regex_pv(pRExC_state, Rx);
7816
7817     RExC_rx->nparens = RExC_total_parens - 1;
7818
7819     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7820     if (RExC_whilem_seen > 15)
7821         RExC_whilem_seen = 15;
7822
7823     DEBUG_PARSE_r({
7824         Perl_re_printf( aTHX_
7825             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7826         RExC_lastnum=0;
7827         RExC_lastparse=NULL;
7828     });
7829
7830 #ifdef RE_TRACK_PATTERN_OFFSETS
7831     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7832                           "%s %" UVuf " bytes for offset annotations.\n",
7833                           RExC_offsets ? "Got" : "Couldn't get",
7834                           (UV)((RExC_offsets[0] * 2 + 1))));
7835     DEBUG_OFFSETS_r(if (RExC_offsets) {
7836         const STRLEN len = RExC_offsets[0];
7837         STRLEN i;
7838         GET_RE_DEBUG_FLAGS_DECL;
7839         Perl_re_printf( aTHX_
7840                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7841         for (i = 1; i <= len; i++) {
7842             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7843                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7844                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7845         }
7846         Perl_re_printf( aTHX_  "\n");
7847     });
7848
7849 #else
7850     SetProgLen(RExC_rxi,RExC_size);
7851 #endif
7852
7853     DEBUG_DUMP_PRE_OPTIMIZE_r({
7854         SV * const sv = sv_newmortal();
7855         RXi_GET_DECL(RExC_rx, ri);
7856         DEBUG_RExC_seen();
7857         Perl_re_printf( aTHX_ "Program before optimization:\n");
7858
7859         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7860                         sv, 0, 0);
7861     });
7862
7863     DEBUG_OPTIMISE_r(
7864         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7865     );
7866
7867     /* XXXX To minimize changes to RE engine we always allocate
7868        3-units-long substrs field. */
7869     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7870     if (RExC_recurse_count) {
7871         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7872         SAVEFREEPV(RExC_recurse);
7873     }
7874
7875     if (RExC_seen & REG_RECURSE_SEEN) {
7876         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7877          * So it's 1 if there are no parens. */
7878         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7879                                          ((RExC_total_parens & 0x07) != 0);
7880         Newx(RExC_study_chunk_recursed,
7881              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7882         SAVEFREEPV(RExC_study_chunk_recursed);
7883     }
7884
7885   reStudy:
7886     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7887     DEBUG_r(
7888         RExC_study_chunk_recursed_count= 0;
7889     );
7890     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7891     if (RExC_study_chunk_recursed) {
7892         Zero(RExC_study_chunk_recursed,
7893              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7894     }
7895
7896
7897 #ifdef TRIE_STUDY_OPT
7898     if (!restudied) {
7899         StructCopy(&zero_scan_data, &data, scan_data_t);
7900         copyRExC_state = RExC_state;
7901     } else {
7902         U32 seen=RExC_seen;
7903         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7904
7905         RExC_state = copyRExC_state;
7906         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7907             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7908         else
7909             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7910         StructCopy(&zero_scan_data, &data, scan_data_t);
7911     }
7912 #else
7913     StructCopy(&zero_scan_data, &data, scan_data_t);
7914 #endif
7915
7916     /* Dig out information for optimizations. */
7917     RExC_rx->extflags = RExC_flags; /* was pm_op */
7918     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7919
7920     if (UTF)
7921         SvUTF8_on(Rx);  /* Unicode in it? */
7922     RExC_rxi->regstclass = NULL;
7923     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7924         RExC_rx->intflags |= PREGf_NAUGHTY;
7925     scan = RExC_rxi->program + 1;               /* First BRANCH. */
7926
7927     /* testing for BRANCH here tells us whether there is "must appear"
7928        data in the pattern. If there is then we can use it for optimisations */
7929     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7930                                                   */
7931         SSize_t fake;
7932         STRLEN longest_length[2];
7933         regnode_ssc ch_class; /* pointed to by data */
7934         int stclass_flag;
7935         SSize_t last_close = 0; /* pointed to by data */
7936         regnode *first= scan;
7937         regnode *first_next= regnext(first);
7938         int i;
7939
7940         /*
7941          * Skip introductions and multiplicators >= 1
7942          * so that we can extract the 'meat' of the pattern that must
7943          * match in the large if() sequence following.
7944          * NOTE that EXACT is NOT covered here, as it is normally
7945          * picked up by the optimiser separately.
7946          *
7947          * This is unfortunate as the optimiser isn't handling lookahead
7948          * properly currently.
7949          *
7950          */
7951         while ((OP(first) == OPEN && (sawopen = 1)) ||
7952                /* An OR of *one* alternative - should not happen now. */
7953             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7954             /* for now we can't handle lookbehind IFMATCH*/
7955             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7956             (OP(first) == PLUS) ||
7957             (OP(first) == MINMOD) ||
7958                /* An {n,m} with n>0 */
7959             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7960             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7961         {
7962                 /*
7963                  * the only op that could be a regnode is PLUS, all the rest
7964                  * will be regnode_1 or regnode_2.
7965                  *
7966                  * (yves doesn't think this is true)
7967                  */
7968                 if (OP(first) == PLUS)
7969                     sawplus = 1;
7970                 else {
7971                     if (OP(first) == MINMOD)
7972                         sawminmod = 1;
7973                     first += regarglen[OP(first)];
7974                 }
7975                 first = NEXTOPER(first);
7976                 first_next= regnext(first);
7977         }
7978
7979         /* Starting-point info. */
7980       again:
7981         DEBUG_PEEP("first:", first, 0, 0);
7982         /* Ignore EXACT as we deal with it later. */
7983         if (PL_regkind[OP(first)] == EXACT) {
7984             if (   OP(first) == EXACT
7985                 || OP(first) == LEXACT
7986                 || OP(first) == EXACT_REQ8
7987                 || OP(first) == LEXACT_REQ8
7988                 || OP(first) == EXACTL)
7989             {
7990                 NOOP;   /* Empty, get anchored substr later. */
7991             }
7992             else
7993                 RExC_rxi->regstclass = first;
7994         }
7995 #ifdef TRIE_STCLASS
7996         else if (PL_regkind[OP(first)] == TRIE &&
7997                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7998         {
7999             /* this can happen only on restudy */
8000             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8001         }
8002 #endif
8003         else if (REGNODE_SIMPLE(OP(first)))
8004             RExC_rxi->regstclass = first;
8005         else if (PL_regkind[OP(first)] == BOUND ||
8006                  PL_regkind[OP(first)] == NBOUND)
8007             RExC_rxi->regstclass = first;
8008         else if (PL_regkind[OP(first)] == BOL) {
8009             RExC_rx->intflags |= (OP(first) == MBOL
8010                            ? PREGf_ANCH_MBOL
8011                            : PREGf_ANCH_SBOL);
8012             first = NEXTOPER(first);
8013             goto again;
8014         }
8015         else if (OP(first) == GPOS) {
8016             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8017             first = NEXTOPER(first);
8018             goto again;
8019         }
8020         else if ((!sawopen || !RExC_sawback) &&
8021             !sawlookahead &&
8022             (OP(first) == STAR &&
8023             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8024             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8025         {
8026             /* turn .* into ^.* with an implied $*=1 */
8027             const int type =
8028                 (OP(NEXTOPER(first)) == REG_ANY)
8029                     ? PREGf_ANCH_MBOL
8030                     : PREGf_ANCH_SBOL;
8031             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8032             first = NEXTOPER(first);
8033             goto again;
8034         }
8035         if (sawplus && !sawminmod && !sawlookahead
8036             && (!sawopen || !RExC_sawback)
8037             && !pRExC_state->code_blocks) /* May examine pos and $& */
8038             /* x+ must match at the 1st pos of run of x's */
8039             RExC_rx->intflags |= PREGf_SKIP;
8040
8041         /* Scan is after the zeroth branch, first is atomic matcher. */
8042 #ifdef TRIE_STUDY_OPT
8043         DEBUG_PARSE_r(
8044             if (!restudied)
8045                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8046                               (IV)(first - scan + 1))
8047         );
8048 #else
8049         DEBUG_PARSE_r(
8050             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8051                 (IV)(first - scan + 1))
8052         );
8053 #endif
8054
8055
8056         /*
8057         * If there's something expensive in the r.e., find the
8058         * longest literal string that must appear and make it the
8059         * regmust.  Resolve ties in favor of later strings, since
8060         * the regstart check works with the beginning of the r.e.
8061         * and avoiding duplication strengthens checking.  Not a
8062         * strong reason, but sufficient in the absence of others.
8063         * [Now we resolve ties in favor of the earlier string if
8064         * it happens that c_offset_min has been invalidated, since the
8065         * earlier string may buy us something the later one won't.]
8066         */
8067
8068         data.substrs[0].str = newSVpvs("");
8069         data.substrs[1].str = newSVpvs("");
8070         data.last_found = newSVpvs("");
8071         data.cur_is_floating = 0; /* initially any found substring is fixed */
8072         ENTER_with_name("study_chunk");
8073         SAVEFREESV(data.substrs[0].str);
8074         SAVEFREESV(data.substrs[1].str);
8075         SAVEFREESV(data.last_found);
8076         first = scan;
8077         if (!RExC_rxi->regstclass) {
8078             ssc_init(pRExC_state, &ch_class);
8079             data.start_class = &ch_class;
8080             stclass_flag = SCF_DO_STCLASS_AND;
8081         } else                          /* XXXX Check for BOUND? */
8082             stclass_flag = 0;
8083         data.last_closep = &last_close;
8084
8085         DEBUG_RExC_seen();
8086         /*
8087          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8088          * (NO top level branches)
8089          */
8090         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8091                              scan + RExC_size, /* Up to end */
8092             &data, -1, 0, NULL,
8093             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8094                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8095             0);
8096
8097
8098         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8099
8100
8101         if ( RExC_total_parens == 1 && !data.cur_is_floating
8102              && data.last_start_min == 0 && data.last_end > 0
8103              && !RExC_seen_zerolen
8104              && !(RExC_seen & REG_VERBARG_SEEN)
8105              && !(RExC_seen & REG_GPOS_SEEN)
8106         ){
8107             RExC_rx->extflags |= RXf_CHECK_ALL;
8108         }
8109         scan_commit(pRExC_state, &data,&minlen, 0);
8110
8111
8112         /* XXX this is done in reverse order because that's the way the
8113          * code was before it was parameterised. Don't know whether it
8114          * actually needs doing in reverse order. DAPM */
8115         for (i = 1; i >= 0; i--) {
8116             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8117
8118             if (   !(   i
8119                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8120                      &&    data.substrs[0].min_offset
8121                         == data.substrs[1].min_offset
8122                      &&    SvCUR(data.substrs[0].str)
8123                         == SvCUR(data.substrs[1].str)
8124                     )
8125                 && S_setup_longest (aTHX_ pRExC_state,
8126                                         &(RExC_rx->substrs->data[i]),
8127                                         &(data.substrs[i]),
8128                                         longest_length[i]))
8129             {
8130                 RExC_rx->substrs->data[i].min_offset =
8131                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8132
8133                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8134                 /* Don't offset infinity */
8135                 if (data.substrs[i].max_offset < SSize_t_MAX)
8136                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8137                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8138             }
8139             else {
8140                 RExC_rx->substrs->data[i].substr      = NULL;
8141                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8142                 longest_length[i] = 0;
8143             }
8144         }
8145
8146         LEAVE_with_name("study_chunk");
8147
8148         if (RExC_rxi->regstclass
8149             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8150             RExC_rxi->regstclass = NULL;
8151
8152         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8153               || RExC_rx->substrs->data[0].min_offset)
8154             && stclass_flag
8155             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8156             && is_ssc_worth_it(pRExC_state, data.start_class))
8157         {
8158             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8159
8160             ssc_finalize(pRExC_state, data.start_class);
8161
8162             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8163             StructCopy(data.start_class,
8164                        (regnode_ssc*)RExC_rxi->data->data[n],
8165                        regnode_ssc);
8166             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8167             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8168             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8169                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8170                       Perl_re_printf( aTHX_
8171                                     "synthetic stclass \"%s\".\n",
8172                                     SvPVX_const(sv));});
8173             data.start_class = NULL;
8174         }
8175
8176         /* A temporary algorithm prefers floated substr to fixed one of
8177          * same length to dig more info. */
8178         i = (longest_length[0] <= longest_length[1]);
8179         RExC_rx->substrs->check_ix = i;
8180         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8181         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8182         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8183         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8184         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8185         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8186             RExC_rx->intflags |= PREGf_NOSCAN;
8187
8188         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8189             RExC_rx->extflags |= RXf_USE_INTUIT;
8190             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8191                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8192         }
8193
8194         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8195         if ( (STRLEN)minlen < longest_length[1] )
8196             minlen= longest_length[1];
8197         if ( (STRLEN)minlen < longest_length[0] )
8198             minlen= longest_length[0];
8199         */
8200     }
8201     else {
8202         /* Several toplevels. Best we can do is to set minlen. */
8203         SSize_t fake;
8204         regnode_ssc ch_class;
8205         SSize_t last_close = 0;
8206
8207         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8208
8209         scan = RExC_rxi->program + 1;
8210         ssc_init(pRExC_state, &ch_class);
8211         data.start_class = &ch_class;
8212         data.last_closep = &last_close;
8213
8214         DEBUG_RExC_seen();
8215         /*
8216          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8217          * (patterns WITH top level branches)
8218          */
8219         minlen = study_chunk(pRExC_state,
8220             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8221             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8222                                                       ? SCF_TRIE_DOING_RESTUDY
8223                                                       : 0),
8224             0);
8225
8226         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8227
8228         RExC_rx->check_substr = NULL;
8229         RExC_rx->check_utf8 = NULL;
8230         RExC_rx->substrs->data[0].substr      = NULL;
8231         RExC_rx->substrs->data[0].utf8_substr = NULL;
8232         RExC_rx->substrs->data[1].substr      = NULL;
8233         RExC_rx->substrs->data[1].utf8_substr = NULL;
8234
8235         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8236             && is_ssc_worth_it(pRExC_state, data.start_class))
8237         {
8238             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8239
8240             ssc_finalize(pRExC_state, data.start_class);
8241
8242             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8243             StructCopy(data.start_class,
8244                        (regnode_ssc*)RExC_rxi->data->data[n],
8245                        regnode_ssc);
8246             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8247             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8248             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8249                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8250                       Perl_re_printf( aTHX_
8251                                     "synthetic stclass \"%s\".\n",
8252                                     SvPVX_const(sv));});
8253             data.start_class = NULL;
8254         }
8255     }
8256
8257     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8258         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8259         RExC_rx->maxlen = REG_INFTY;
8260     }
8261     else {
8262         RExC_rx->maxlen = RExC_maxlen;
8263     }
8264
8265     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8266        the "real" pattern. */
8267     DEBUG_OPTIMISE_r({
8268         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8269                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8270     });
8271     RExC_rx->minlenret = minlen;
8272     if (RExC_rx->minlen < minlen)
8273         RExC_rx->minlen = minlen;
8274
8275     if (RExC_seen & REG_RECURSE_SEEN ) {
8276         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8277         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8278     }
8279     if (RExC_seen & REG_GPOS_SEEN)
8280         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8281     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8282         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8283                                                 lookbehind */
8284     if (pRExC_state->code_blocks)
8285         RExC_rx->extflags |= RXf_EVAL_SEEN;
8286     if (RExC_seen & REG_VERBARG_SEEN)
8287     {
8288         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8289         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8290     }
8291     if (RExC_seen & REG_CUTGROUP_SEEN)
8292         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8293     if (pm_flags & PMf_USE_RE_EVAL)
8294         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8295     if (RExC_paren_names)
8296         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8297     else
8298         RXp_PAREN_NAMES(RExC_rx) = NULL;
8299
8300     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8301      * so it can be used in pp.c */
8302     if (RExC_rx->intflags & PREGf_ANCH)
8303         RExC_rx->extflags |= RXf_IS_ANCHORED;
8304
8305
8306     {
8307         /* this is used to identify "special" patterns that might result
8308          * in Perl NOT calling the regex engine and instead doing the match "itself",
8309          * particularly special cases in split//. By having the regex compiler
8310          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8311          * we avoid weird issues with equivalent patterns resulting in different behavior,
8312          * AND we allow non Perl engines to get the same optimizations by the setting the
8313          * flags appropriately - Yves */
8314         regnode *first = RExC_rxi->program + 1;
8315         U8 fop = OP(first);
8316         regnode *next = regnext(first);
8317         U8 nop = OP(next);
8318
8319         if (PL_regkind[fop] == NOTHING && nop == END)
8320             RExC_rx->extflags |= RXf_NULL;
8321         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8322             /* when fop is SBOL first->flags will be true only when it was
8323              * produced by parsing /\A/, and not when parsing /^/. This is
8324              * very important for the split code as there we want to
8325              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8326              * See rt #122761 for more details. -- Yves */
8327             RExC_rx->extflags |= RXf_START_ONLY;
8328         else if (fop == PLUS
8329                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8330                  && nop == END)
8331             RExC_rx->extflags |= RXf_WHITE;
8332         else if ( RExC_rx->extflags & RXf_SPLIT
8333                   && (   fop == EXACT || fop == LEXACT
8334                       || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8335                       || fop == EXACTL)
8336                   && STR_LEN(first) == 1
8337                   && *(STRING(first)) == ' '
8338                   && nop == END )
8339             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8340
8341     }
8342
8343     if (RExC_contains_locale) {
8344         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8345     }
8346
8347 #ifdef DEBUGGING
8348     if (RExC_paren_names) {
8349         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8350         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8351                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8352     } else
8353 #endif
8354     RExC_rxi->name_list_idx = 0;
8355
8356     while ( RExC_recurse_count > 0 ) {
8357         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8358         /*
8359          * This data structure is set up in study_chunk() and is used
8360          * to calculate the distance between a GOSUB regopcode and
8361          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8362          * it refers to.
8363          *
8364          * If for some reason someone writes code that optimises
8365          * away a GOSUB opcode then the assert should be changed to
8366          * an if(scan) to guard the ARG2L_SET() - Yves
8367          *
8368          */
8369         assert(scan && OP(scan) == GOSUB);
8370         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8371     }
8372
8373     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8374     /* assume we don't need to swap parens around before we match */
8375     DEBUG_TEST_r({
8376         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8377             (unsigned long)RExC_study_chunk_recursed_count);
8378     });
8379     DEBUG_DUMP_r({
8380         DEBUG_RExC_seen();
8381         Perl_re_printf( aTHX_ "Final program:\n");
8382         regdump(RExC_rx);
8383     });
8384
8385     if (RExC_open_parens) {
8386         Safefree(RExC_open_parens);
8387         RExC_open_parens = NULL;
8388     }
8389     if (RExC_close_parens) {
8390         Safefree(RExC_close_parens);
8391         RExC_close_parens = NULL;
8392     }
8393
8394 #ifdef USE_ITHREADS
8395     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8396      * by setting the regexp SV to readonly-only instead. If the
8397      * pattern's been recompiled, the USEDness should remain. */
8398     if (old_re && SvREADONLY(old_re))
8399         SvREADONLY_on(Rx);
8400 #endif
8401     return Rx;
8402 }
8403
8404
8405 SV*
8406 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8407                     const U32 flags)
8408 {
8409     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8410
8411     PERL_UNUSED_ARG(value);
8412
8413     if (flags & RXapif_FETCH) {
8414         return reg_named_buff_fetch(rx, key, flags);
8415     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8416         Perl_croak_no_modify();
8417         return NULL;
8418     } else if (flags & RXapif_EXISTS) {
8419         return reg_named_buff_exists(rx, key, flags)
8420             ? &PL_sv_yes
8421             : &PL_sv_no;
8422     } else if (flags & RXapif_REGNAMES) {
8423         return reg_named_buff_all(rx, flags);
8424     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8425         return reg_named_buff_scalar(rx, flags);
8426     } else {
8427         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8428         return NULL;
8429     }
8430 }
8431
8432 SV*
8433 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8434                          const U32 flags)
8435 {
8436     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8437     PERL_UNUSED_ARG(lastkey);
8438
8439     if (flags & RXapif_FIRSTKEY)
8440         return reg_named_buff_firstkey(rx, flags);
8441     else if (flags & RXapif_NEXTKEY)
8442         return reg_named_buff_nextkey(rx, flags);
8443     else {
8444         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8445                                             (int)flags);
8446         return NULL;
8447     }
8448 }
8449
8450 SV*
8451 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8452                           const U32 flags)
8453 {
8454     SV *ret;
8455     struct regexp *const rx = ReANY(r);
8456
8457     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8458
8459     if (rx && RXp_PAREN_NAMES(rx)) {
8460         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8461         if (he_str) {
8462             IV i;
8463             SV* sv_dat=HeVAL(he_str);
8464             I32 *nums=(I32*)SvPVX(sv_dat);
8465             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8466             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8467                 if ((I32)(rx->nparens) >= nums[i]
8468                     && rx->offs[nums[i]].start != -1
8469                     && rx->offs[nums[i]].end != -1)
8470                 {
8471                     ret = newSVpvs("");
8472                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8473                     if (!retarray)
8474                         return ret;
8475                 } else {
8476                     if (retarray)
8477                         ret = newSVsv(&PL_sv_undef);
8478                 }
8479                 if (retarray)
8480                     av_push(retarray, ret);
8481             }
8482             if (retarray)
8483                 return newRV_noinc(MUTABLE_SV(retarray));
8484         }
8485     }
8486     return NULL;
8487 }
8488
8489 bool
8490 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8491                            const U32 flags)
8492 {
8493     struct regexp *const rx = ReANY(r);
8494
8495     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8496
8497     if (rx && RXp_PAREN_NAMES(rx)) {
8498         if (flags & RXapif_ALL) {
8499             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8500         } else {
8501             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8502             if (sv) {
8503                 SvREFCNT_dec_NN(sv);
8504                 return TRUE;
8505             } else {
8506                 return FALSE;
8507             }
8508         }
8509     } else {
8510         return FALSE;
8511     }
8512 }
8513
8514 SV*
8515 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8516 {
8517     struct regexp *const rx = ReANY(r);
8518
8519     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8520
8521     if ( rx && RXp_PAREN_NAMES(rx) ) {
8522         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8523
8524         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8525     } else {
8526         return FALSE;
8527     }
8528 }
8529
8530 SV*
8531 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8532 {
8533     struct regexp *const rx = ReANY(r);
8534     GET_RE_DEBUG_FLAGS_DECL;
8535
8536     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8537
8538     if (rx && RXp_PAREN_NAMES(rx)) {
8539         HV *hv = RXp_PAREN_NAMES(rx);
8540         HE *temphe;
8541         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8542             IV i;
8543             IV parno = 0;
8544             SV* sv_dat = HeVAL(temphe);
8545             I32 *nums = (I32*)SvPVX(sv_dat);
8546             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8547                 if ((I32)(rx->lastparen) >= nums[i] &&
8548                     rx->offs[nums[i]].start != -1 &&
8549                     rx->offs[nums[i]].end != -1)
8550                 {
8551                     parno = nums[i];
8552                     break;
8553                 }
8554             }
8555             if (parno || flags & RXapif_ALL) {
8556                 return newSVhek(HeKEY_hek(temphe));
8557             }
8558         }
8559     }
8560     return NULL;
8561 }
8562
8563 SV*
8564 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8565 {
8566     SV *ret;
8567     AV *av;
8568     SSize_t length;
8569     struct regexp *const rx = ReANY(r);
8570
8571     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8572
8573     if (rx && RXp_PAREN_NAMES(rx)) {
8574         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8575             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8576         } else if (flags & RXapif_ONE) {
8577             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8578             av = MUTABLE_AV(SvRV(ret));
8579             length = av_tindex(av);
8580             SvREFCNT_dec_NN(ret);
8581             return newSViv(length + 1);
8582         } else {
8583             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8584                                                 (int)flags);
8585             return NULL;
8586         }
8587     }
8588     return &PL_sv_undef;
8589 }
8590
8591 SV*
8592 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8593 {
8594     struct regexp *const rx = ReANY(r);
8595     AV *av = newAV();
8596
8597     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8598
8599     if (rx && RXp_PAREN_NAMES(rx)) {
8600         HV *hv= RXp_PAREN_NAMES(rx);
8601         HE *temphe;
8602         (void)hv_iterinit(hv);
8603         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8604             IV i;
8605             IV parno = 0;
8606             SV* sv_dat = HeVAL(temphe);
8607             I32 *nums = (I32*)SvPVX(sv_dat);
8608             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8609                 if ((I32)(rx->lastparen) >= nums[i] &&
8610                     rx->offs[nums[i]].start != -1 &&
8611                     rx->offs[nums[i]].end != -1)
8612                 {
8613                     parno = nums[i];
8614                     break;
8615                 }
8616             }
8617             if (parno || flags & RXapif_ALL) {
8618                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8619             }
8620         }
8621     }
8622
8623     return newRV_noinc(MUTABLE_SV(av));
8624 }
8625
8626 void
8627 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8628                              SV * const sv)
8629 {
8630     struct regexp *const rx = ReANY(r);
8631     char *s = NULL;
8632     SSize_t i = 0;
8633     SSize_t s1, t1;
8634     I32 n = paren;
8635
8636     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8637
8638     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8639            || n == RX_BUFF_IDX_CARET_FULLMATCH
8640            || n == RX_BUFF_IDX_CARET_POSTMATCH
8641        )
8642     {
8643         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8644         if (!keepcopy) {
8645             /* on something like
8646              *    $r = qr/.../;
8647              *    /$qr/p;
8648              * the KEEPCOPY is set on the PMOP rather than the regex */
8649             if (PL_curpm && r == PM_GETRE(PL_curpm))
8650                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8651         }
8652         if (!keepcopy)
8653             goto ret_undef;
8654     }
8655
8656     if (!rx->subbeg)
8657         goto ret_undef;
8658
8659     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8660         /* no need to distinguish between them any more */
8661         n = RX_BUFF_IDX_FULLMATCH;
8662
8663     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8664         && rx->offs[0].start != -1)
8665     {
8666         /* $`, ${^PREMATCH} */
8667         i = rx->offs[0].start;
8668         s = rx->subbeg;
8669     }
8670     else
8671     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8672         && rx->offs[0].end != -1)
8673     {
8674         /* $', ${^POSTMATCH} */
8675         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8676         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8677     }
8678     else
8679     if (inRANGE(n, 0, (I32)rx->nparens) &&
8680         (s1 = rx->offs[n].start) != -1  &&
8681         (t1 = rx->offs[n].end) != -1)
8682     {
8683         /* $&, ${^MATCH},  $1 ... */
8684         i = t1 - s1;
8685         s = rx->subbeg + s1 - rx->suboffset;
8686     } else {
8687         goto ret_undef;
8688     }
8689
8690     assert(s >= rx->subbeg);
8691     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8692     if (i >= 0) {
8693 #ifdef NO_TAINT_SUPPORT
8694         sv_setpvn(sv, s, i);
8695 #else
8696         const int oldtainted = TAINT_get;
8697         TAINT_NOT;
8698         sv_setpvn(sv, s, i);
8699         TAINT_set(oldtainted);
8700 #endif
8701         if (RXp_MATCH_UTF8(rx))
8702             SvUTF8_on(sv);
8703         else
8704             SvUTF8_off(sv);
8705         if (TAINTING_get) {
8706             if (RXp_MATCH_TAINTED(rx)) {
8707                 if (SvTYPE(sv) >= SVt_PVMG) {
8708                     MAGIC* const mg = SvMAGIC(sv);
8709                     MAGIC* mgt;
8710                     TAINT;
8711                     SvMAGIC_set(sv, mg->mg_moremagic);
8712                     SvTAINT(sv);
8713                     if ((mgt = SvMAGIC(sv))) {
8714                         mg->mg_moremagic = mgt;
8715                         SvMAGIC_set(sv, mg);
8716                     }
8717                 } else {
8718                     TAINT;
8719                     SvTAINT(sv);
8720                 }
8721             } else
8722                 SvTAINTED_off(sv);
8723         }
8724     } else {
8725       ret_undef:
8726         sv_set_undef(sv);
8727         return;
8728     }
8729 }
8730
8731 void
8732 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8733                                                          SV const * const value)
8734 {
8735     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8736
8737     PERL_UNUSED_ARG(rx);
8738     PERL_UNUSED_ARG(paren);
8739     PERL_UNUSED_ARG(value);
8740
8741     if (!PL_localizing)
8742         Perl_croak_no_modify();
8743 }
8744
8745 I32
8746 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8747                               const I32 paren)
8748 {
8749     struct regexp *const rx = ReANY(r);
8750     I32 i;
8751     I32 s1, t1;
8752
8753     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8754
8755     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8756         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8757         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8758     )
8759     {
8760         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8761         if (!keepcopy) {
8762             /* on something like
8763              *    $r = qr/.../;
8764              *    /$qr/p;
8765              * the KEEPCOPY is set on the PMOP rather than the regex */
8766             if (PL_curpm && r == PM_GETRE(PL_curpm))
8767                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8768         }
8769         if (!keepcopy)
8770             goto warn_undef;
8771     }
8772
8773     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8774     switch (paren) {
8775       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8776       case RX_BUFF_IDX_PREMATCH:       /* $` */
8777         if (rx->offs[0].start != -1) {
8778                         i = rx->offs[0].start;
8779                         if (i > 0) {
8780                                 s1 = 0;
8781                                 t1 = i;
8782                                 goto getlen;
8783                         }
8784             }
8785         return 0;
8786
8787       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8788       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8789             if (rx->offs[0].end != -1) {
8790                         i = rx->sublen - rx->offs[0].end;
8791                         if (i > 0) {
8792                                 s1 = rx->offs[0].end;
8793                                 t1 = rx->sublen;
8794                                 goto getlen;
8795                         }
8796             }
8797         return 0;
8798
8799       default: /* $& / ${^MATCH}, $1, $2, ... */
8800             if (paren <= (I32)rx->nparens &&
8801             (s1 = rx->offs[paren].start) != -1 &&
8802             (t1 = rx->offs[paren].end) != -1)
8803             {
8804             i = t1 - s1;
8805             goto getlen;
8806         } else {
8807           warn_undef:
8808             if (ckWARN(WARN_UNINITIALIZED))
8809                 report_uninit((const SV *)sv);
8810             return 0;
8811         }
8812     }
8813   getlen:
8814     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8815         const char * const s = rx->subbeg - rx->suboffset + s1;
8816         const U8 *ep;
8817         STRLEN el;
8818
8819         i = t1 - s1;
8820         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8821                         i = el;
8822     }
8823     return i;
8824 }
8825
8826 SV*
8827 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8828 {
8829     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8830         PERL_UNUSED_ARG(rx);
8831         if (0)
8832             return NULL;
8833         else
8834             return newSVpvs("Regexp");
8835 }
8836
8837 /* Scans the name of a named buffer from the pattern.
8838  * If flags is REG_RSN_RETURN_NULL returns null.
8839  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8840  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8841  * to the parsed name as looked up in the RExC_paren_names hash.
8842  * If there is an error throws a vFAIL().. type exception.
8843  */
8844
8845 #define REG_RSN_RETURN_NULL    0
8846 #define REG_RSN_RETURN_NAME    1
8847 #define REG_RSN_RETURN_DATA    2
8848
8849 STATIC SV*
8850 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8851 {
8852     char *name_start = RExC_parse;
8853     SV* sv_name;
8854
8855     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8856
8857     assert (RExC_parse <= RExC_end);
8858     if (RExC_parse == RExC_end) NOOP;
8859     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8860          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8861           * using do...while */
8862         if (UTF)
8863             do {
8864                 RExC_parse += UTF8SKIP(RExC_parse);
8865             } while (   RExC_parse < RExC_end
8866                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8867         else
8868             do {
8869                 RExC_parse++;
8870             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8871     } else {
8872         RExC_parse++; /* so the <- from the vFAIL is after the offending
8873                          character */
8874         vFAIL("Group name must start with a non-digit word character");
8875     }
8876     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8877                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8878     if ( flags == REG_RSN_RETURN_NAME)
8879         return sv_name;
8880     else if (flags==REG_RSN_RETURN_DATA) {
8881         HE *he_str = NULL;
8882         SV *sv_dat = NULL;
8883         if ( ! sv_name )      /* should not happen*/
8884             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8885         if (RExC_paren_names)
8886             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8887         if ( he_str )
8888             sv_dat = HeVAL(he_str);
8889         if ( ! sv_dat ) {   /* Didn't find group */
8890
8891             /* It might be a forward reference; we can't fail until we
8892                 * know, by completing the parse to get all the groups, and
8893                 * then reparsing */
8894             if (ALL_PARENS_COUNTED)  {
8895                 vFAIL("Reference to nonexistent named group");
8896             }
8897             else {
8898                 REQUIRE_PARENS_PASS;
8899             }
8900         }
8901         return sv_dat;
8902     }
8903
8904     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8905                      (unsigned long) flags);
8906 }
8907
/* Parse-tracing helpers; everything they do is wrapped in DEBUG_PARSE_r().
 * They expect a variable named 'depth' to be in scope at the point of use.
 *
 * DEBUG_PARSE_MSG(funcname) prints the start of a trace line:
 *   - the unparsed remainder of the pattern, prettified to 16 columns, or
 *     16 blanks if the parse position is unchanged since the previous call;
 *   - "|" and RExC_emit in a 4-wide field, or 4 blanks if it is unchanged
 *     since the previous call;
 *   - "|", 2*depth blanks of indent, then 'funcname'.
 * It then remembers the current RExC_emit and RExC_parse for the next call.
 *
 * DEBUG_PARSE(funcname) prints that prefix and ends the line.
 * DEBUG_PARSE_FMT(funcname,fmt,args) prints that prefix followed by 'args'
 * formatted with 'fmt', and ends the line. */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
    if (RExC_lastparse == RExC_parse) {                         \
        Perl_re_printf( aTHX_ "%16s","");                       \
    }                                                           \
    else {                                                      \
        Perl_re_printf( aTHX_  "%s",                            \
            Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
                RExC_end - RExC_parse, 16,                      \
                "", "",                                         \
                PERL_PV_ESCAPE_UNI_DETECT |                     \
                PERL_PV_PRETTY_ELLIPSES   |                     \
                PERL_PV_PRETTY_LTGT       |                     \
                PERL_PV_ESCAPE_RE         |                     \
                PERL_PV_PRETTY_EXACTSIZE                        \
            )                                                   \
        );                                                      \
    }                                                           \
                                                                \
    if (RExC_lastnum == RExC_emit) {                            \
        Perl_re_printf( aTHX_ "|%4s","");                       \
    }                                                           \
    else {                                                      \
        Perl_re_printf( aTHX_ "|%4d", RExC_emit);               \
    }                                                           \
                                                                \
    Perl_re_printf( aTHX_ "|%*s%-4s",                           \
        (int)((depth*2)), "",                                   \
        (funcname)                                              \
    );                                                          \
                                                                \
    RExC_lastnum   = RExC_emit;                                 \
    RExC_lastparse = RExC_parse;                                \
})

#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({               \
    DEBUG_PARSE_MSG((funcname));                                \
    Perl_re_printf( aTHX_ "%4s","\n");                          \
})

#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({  \
    DEBUG_PARSE_MSG((funcname));                                \
    Perl_re_printf( aTHX_ fmt "\n",args);                       \
})
8946
8947 /* This section of code defines the inversion list object and its methods.  The
8948  * interfaces are highly subject to change, so as much as possible is static to
8949  * this file.  An inversion list is here implemented as a malloc'd C UV array
8950  * as an SVt_INVLIST scalar.
8951  *
8952  * An inversion list for Unicode is an array of code points, sorted by ordinal
8953  * number.  Each element gives the code point that begins a range that extends
8954  * up-to but not including the code point given by the next element.  The final
8955  * element gives the first code point of a range that extends to the platform's
8956  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8957  * ...) give ranges whose code points are all in the inversion list.  We say
8958  * that those ranges are in the set.  The odd-numbered elements give ranges
8959  * whose code points are not in the inversion list, and hence not in the set.
8960  * Thus, element [0] is the first code point in the list.  Element [1]
8961  * is the first code point beyond that not in the list; and element [2] is the
8962  * first code point beyond that that is in the list.  In other words, the first
8963  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8964  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8965  * all code points in that range are not in the inversion list.  The third
8966  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8967  * list, and so forth.  Thus every element whose index is divisible by two
8968  * gives the beginning of a range that is in the list, and every element whose
8969  * index is not divisible by two gives the beginning of a range not in the
8970  * list.  If the final element's index is divisible by two, the inversion list
8971  * extends to the platform's infinity; otherwise the highest code point in the
8972  * inversion list is the contents of that element minus 1.
8973  *
8974  * A range that contains just a single code point N will look like
8975  *  invlist[i]   == N
8976  *  invlist[i+1] == N+1
8977  *
8978  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8979  * impossible to represent, so element [i+1] is omitted.  The single element
8980  * inversion list
8981  *  invlist[0] == UV_MAX
8982  * contains just UV_MAX, but is interpreted as matching to infinity.
8983  *
8984  * Taking the complement (inverting) an inversion list is quite simple, if the
8985  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8986  * This implementation reserves an element at the beginning of each inversion
8987  * list to always contain 0; there is an additional flag in the header which
8988  * indicates if the list begins at the 0, or is offset to begin at the next
8989  * element.  This means that the inversion list can be inverted without any
8990  * copying; just flip the flag.
8991  *
8992  * More about inversion lists can be found in "Unicode Demystified"
8993  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8994  *
8995  * The inversion list data structure is currently implemented as an SV pointing
8996  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8997  * array of UV whose memory management is automatically handled by the existing
8998  * facilities for SV's.
8999  *
9000  * Some of the methods should always be private to the implementation, and some
9001  * should eventually be made public */
9002
9003 /* The header definitions are in F<invlist_inline.h> */
9004
9005 #ifndef PERL_IN_XSUB_RE
9006
9007 PERL_STATIC_INLINE UV*
9008 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9009 {
9010     /* Prepares a still-empty inversion list for being filled in, and hands
9011      * back the address at which its first real element is to be stored.
9012      * Every list reserves a leading slot that always holds 0.  When
9013      * 'will_have_0' is TRUE, the caller is about to make U+0000 part of the
9014      * list, so the returned address is that reserved slot.  When it is
9015      * FALSE, the returned address is the slot just past it.  The header's
9016      * offset flag is set here to record which of the two was chosen. */
9017
9018     bool* const offset_flag = get_invlist_offset_addr(invlist);
9019     UV* const reserved_slot = (UV *) SvPVX(invlist);
9020
9021     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9022
9023     /* Only a list that has no elements yet may be initialized */
9024     assert(_invlist_len(invlist) == 0);
9025
9026     reserved_slot[0] = 0;
9027
9028     /* The flag is the inverse of 'will_have_0': 1^1 = 0; 1^0 = 1 */
9029     *offset_flag = will_have_0 ^ 1;
9030     return &reserved_slot[*offset_flag];
9031 }
9032
9033 STATIC void
9034 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9035 {
9036     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9037      * steals the buffer from 'src' rather than copying it, so on return 'src'
9038      * has a NULL list of zero length; 'src' itself is not freed here.  This
9039      * is like SvSetMagicSV(), were that implemented on inversion lists */
9040
9041     const UV src_len          = _invlist_len(src);
9042     const bool src_offset     = *get_invlist_offset_addr(src);
9043     const STRLEN src_byte_len = SvLEN(src);
9044     char * array              = SvPVX(src);
9045
9046     const int oldtainted = TAINT_get;
9047
9048     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9049
9050     assert(is_invlist(src));
9051     assert(is_invlist(dest));
9052     assert(! invlist_is_iterating(src));
9053     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9054
9055     /* Make sure it ends in the right place with a NUL, as our inversion list
9056      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9057      * asserts it */
9058     array[src_byte_len - 1] = '\0';
9059
9060     TAINT_NOT;      /* Otherwise it breaks; the saved state is restored below */
9061     sv_usepvn_flags(dest,
9062                     (char *) array,
9063                     src_byte_len - 1,
9064
9065                     /* This flag is documented to cause a copy to be avoided */
9066                     SV_HAS_TRAILING_NUL);
9067     TAINT_set(oldtainted);
9068     SvPV_set(src, 0);       /* 'src' no longer refers to the buffer */
9069     SvLEN_set(src, 0);
9070     SvCUR_set(src, 0);
9071
9072     /* Finish up copying over the other fields in an inversion list, using
          * the values that were saved from 'src' on entry */
9073     *get_invlist_offset_addr(dest) = src_offset;
9074     invlist_set_len(dest, src_len, src_offset);
9075     *get_invlist_previous_index_addr(dest) = 0;  /* Reset the search cache */
9076     invlist_iterfinish(dest);
9077 }
9078
9079 PERL_STATIC_INLINE IV*
9080 S_get_invlist_previous_index_addr(SV* invlist)
9081 {
9082     /* Yields a pointer to the header field of 'invlist' in which the index
9083      * found by the previous search is cached */
9084     XINVLIST* header;
9085     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9086     assert(is_invlist(invlist));
9087     header = (XINVLIST*) SvANY(invlist);
9088     return &header->prev_index;
9089 }
9090
9091 PERL_STATIC_INLINE IV
9092 S_invlist_previous_index(SV* const invlist)
9093 {
9094     /* Fetches the index that the previous search cached in 'invlist' */
9095     const IV* cache_slot;
9096     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9097     cache_slot = get_invlist_previous_index_addr(invlist);
9098     return *cache_slot;
9099 }
9100
9101 PERL_STATIC_INLINE void
9102 S_invlist_set_previous_index(SV* const invlist, const IV index)
9103 {
9104     /* Caches <index>, the result of a search, for later retrieval.  It is
9105      * expected to be 0 or else less than the number of elements in the list */
9106     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9107
9108     /* Compare as IV; narrowing the length to 'int' could mangle a huge one */
9109     assert(index == 0 || index < (IV) _invlist_len(invlist));
9110     *get_invlist_previous_index_addr(invlist) = index;
9111 }
9112
9113 PERL_STATIC_INLINE void
9114 S_invlist_trim(SV* invlist)
9115 {
9116     /* Shrinks the buffer of 'invlist' to give back the space it isn't
9117      * currently using.  It is never shrunk below the room needed for the 0
9118      * UV that is always at the beginning of the list, plus the trailing
9119      * NUL */
9120     const UV floor_size = TO_INTERNAL_SIZE(1) + 1;
9121     STRLEN in_use;
9122     PERL_ARGS_ASSERT_INVLIST_TRIM;
9123     assert(is_invlist(invlist));
9124
9125     in_use = SvCUR(invlist) + 1;    /* The current length, plus the NUL */
9126     SvPV_renew(invlist, MAX(floor_size, in_use));
9127 }
9128
9129 PERL_STATIC_INLINE void
9130 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9131 {
9132     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9133
9134     assert(is_invlist(invlist));
9135
9136     invlist_set_len(invlist, 0, 0);     /* Now holds no elements */
9137     invlist_trim(invlist);              /* Give back the unused space */
9138 }
9139
9140 #endif /* ifndef PERL_IN_XSUB_RE */
9141
9142 PERL_STATIC_INLINE bool
9143 S_invlist_is_iterating(SV* const invlist)
9144 {
9145     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9146     /* An iterator field below (STRLEN) UV_MAX means iteration is under way */
9147     return (STRLEN) UV_MAX > *(get_invlist_iter_addr(invlist));
9148 }
9149
9150 #ifndef PERL_IN_XSUB_RE
9151
9152 PERL_STATIC_INLINE UV
9153 S_invlist_max(SV* const invlist)
9154 {
9155     /* Reports how many elements the array of 'invlist' has room for before a
9156      * realloc() would become necessary */
9157     STRLEN byte_capacity;
9158     PERL_ARGS_ASSERT_INVLIST_MAX;
9159     assert(is_invlist(invlist));
9160
9161     /* SvLEN is 0 for lists made by _new_invlist_C_array; use SvCUR then */
9162     byte_capacity = SvLEN(invlist) ? SvLEN(invlist) : SvCUR(invlist);
9163
9164     /* Pessimistically assume that the reserved 0 element isn't counted in
9165      * the list, and so deduct 1 for it */
9166     return FROM_INTERNAL_SIZE(byte_capacity) - 1;
9167 }
9168
9169 STATIC void
9170 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9171 {
9172     /* Sets up 'invlist' as an empty inversion list that has room for
9173      * 'initial_size' elements.  Two extras are allowed for: an element in
9174      * case the zero element isn't in the list, and a byte for the trailing
9175      * NUL */
9176     const Size_t byte_size = TO_INTERNAL_SIZE(initial_size + 1) + 1;
9177     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9178
9179     SvGROW(invlist, byte_size);
9180     invlist_set_len(invlist, 0, 0);
9181     invlist_iterfinish(invlist);  /* iterinit() is needed before iterating */
9182     *get_invlist_previous_index_addr(invlist) = 0;   /* Nothing cached yet */
9183     SvPOK_on(invlist);  /* This allows B to extract the PV */
9184 }
9185
9186 SV*
9187 Perl__new_invlist(pTHX_ IV initial_size)
9188 {
9189     /* Creates an empty inversion list that has room for 'initial_size'
9190      * elements, and returns it.  A negative 'initial_size' requests that a
9191      * built-in default size be used instead */
9192
9193     const IV default_size = 10;
9194     const IV element_count = (initial_size < 0)
9195                              ? default_size
9196                              : initial_size;
9197
9198     SV* const invlist = newSV_type(SVt_INVLIST);
9199
9200     /* Give the new SV the innards of an empty list */
9201     initialize_invlist_guts(invlist, element_count);
9202
9203     return invlist;
9204 }
9205
9206 SV*
9207 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9208 {
9209     /* Return a pointer to a newly constructed, read-only inversion list that
9210      * points into <list> rather than copying it.  <list> has to be in the
9211      * exact correct inversion list form, including internal fields.  Thus
9212      * this is a dangerous routine that should not be used in the wrong hands.
9213      * <list> begins with HEADER_LENGTH header fields (the length, a version
9214      * id, and the offset flag) that are not part of the list body proper */
9215
9216     const STRLEN length = (STRLEN) list[0];
9217     const UV version_id =          list[1];
9218     const bool offset   =    cBOOL(list[2]);
9219 #define HEADER_LENGTH 3
9220     /* If any of the above changes in any way, you must change HEADER_LENGTH
9221      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9222      *      perl -E 'say int(rand 2**31-1)'
9223      */
9224 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9225                                         data structure type, so that one being
9226                                         passed in can be validated to be an
9227                                         inversion list of the correct vintage.
9228                                        */
9229
9230     SV* invlist = newSV_type(SVt_INVLIST);
9231
9232     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9233
9234     if (version_id != INVLIST_VERSION_ID) {
9235         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9236     }
9237
9238     /* The generated array passed in includes header elements that aren't part
9239      * of the list proper, so start it just after them */
9240     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9241
9242     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9243                                shouldn't touch it */
9244
9245     *(get_invlist_offset_addr(invlist)) = offset;
9246
9247     /* The 'length' passed to us is the physical number of elements in the
9248      * inversion list.  But if there is an offset the logical number is one
9249      * less than that */
9250     invlist_set_len(invlist, length  - offset, offset);
9251
9252     invlist_set_previous_index(invlist, 0);
9253
9254     /* Initialize the iteration pointer. */
9255     invlist_iterfinish(invlist);
9256
9257     SvREADONLY_on(invlist);     /* The passed-in array must not be changed */
9258     SvPOK_on(invlist);
9259
9260     return invlist;
9261 }
9262
9263 STATIC void
9264 S__append_range_to_invlist(pTHX_ SV* const invlist,
9265                                  const UV start, const UV end)
9266 {
9267    /* Subject to change or removal.  Append the range 'start' .. 'end'
9268     * (inclusive) at the end of the inversion list.  The range must be above
9269     * any existing ones, or else this panics. */
9270
9271     UV* array;
9272     UV max = invlist_max(invlist);
9273     UV len = _invlist_len(invlist);
9274     bool offset;
9275
9276     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9277
9278     if (len == 0) { /* Empty lists must be initialized */
9279         offset = start != 0;
9280         array = _invlist_array_init(invlist, ! offset);
9281     }
9282     else {
9283         /* Here, the existing list is non-empty. The current max entry in the
9284          * list is generally the first value not in the set, except when the
9285          * set extends to the end of permissible values, in which case it is
9286          * the first entry in that final set, and so this call is an attempt to
9287          * append out-of-order */
9288
9289         UV final_element = len - 1;
9290         array = invlist_array(invlist);
9291         if (   array[final_element] > start
9292             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9293         {
9294             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9295                      array[final_element], start,
9296                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9297         }
9298
9299         /* Here, it is a legal append.  If the new range begins 1 above the end
9300          * of the range below it, it is extending the range below it, so the
9301          * new first value not in the set is one greater than the newly
9302          * extended range.  */
9303         offset = *get_invlist_offset_addr(invlist);
9304         if (array[final_element] == start) {
9305             if (end != UV_MAX) {
9306                 array[final_element] = end + 1;
9307             }
9308             else {
9309                 /* But if the end is the maximum representable on the machine,
9310                  * assume that infinity was actually what was meant.  Just let
9311                  * the range that this would extend to have no end */
9312                 invlist_set_len(invlist, len - 1, offset);
9313             }
9314             return;
9315         }
9316     }
9317
9318     /* Here the new range doesn't extend any existing set.  Add it */
9319
9320     len += 2;   /* Includes an element each for the start and end of range */
9321
9322     /* If it will overflow the existing space, extend, which may cause the
9323      * array to be moved */
9324     if (max < len) {
9325         invlist_extend(invlist, len);
9326
9327         /* Have to set len here to avoid assert failure in invlist_array() */
9328         invlist_set_len(invlist, len, offset);
9329
9330         array = invlist_array(invlist);
9331     }
9332     else {
9333         invlist_set_len(invlist, len, offset);
9334     }
9335
9336     /* The next item on the list starts the range, the one after that is
9337      * one past the new range.  */
9338     array[len - 2] = start;
9339     if (end != UV_MAX) {
9340         array[len - 1] = end + 1;
9341     }
9342     else {
9343         /* But if the end is the maximum representable on the machine, just let
9344          * the range have no end */
9345         invlist_set_len(invlist, len - 1, offset);
9346     }
9347 }
9348
9349 SSize_t
9350 Perl__invlist_search(SV* const invlist, const UV cp)
9351 {
9352     /* Searches the inversion list for the entry that contains the input code
9353      * point <cp>.  If the list is empty, or <cp> is below its first element,
9354      * -1 is returned.  Otherwise, the return value is the index into the
9355      * list's array of the range that contains <cp>, that is, 'i' such that
9356      *  array[i] <= cp < array[i+1]
9357      */
9358
9359     IV low = 0;
9360     IV mid;
9361     IV high = _invlist_len(invlist);
9362     const IV highest_element = high - 1;
9363     const UV* array;
9364
9365     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9366
9367     /* If list is empty, return failure. */
9368     if (high == 0) {
9369         return -1;
9370     }
9371
9372     /* (We can't get the array unless we know the list is non-empty) */
9373     array = invlist_array(invlist);
9374
9375     mid = invlist_previous_index(invlist);
9376     assert(mid >=0);
9377     if (mid > highest_element) {
9378         mid = highest_element;
9379     }
9380
9381     /* <mid> contains the cache of the result of the previous call to this
9382      * function (0 the first time).  See if this call is for the same result,
9383      * or if it is for mid-1.  This is under the theory that calls to this
9384      * function will often be for related code points that are near each other.
9385      * And benchmarks show that caching gives better results.  We also test
9386      * here if the code point is within the bounds of the list.  These tests
9387      * replace others that would have had to be made anyway to make sure that
9388      * the array bounds were not exceeded, and these give us extra information
9389      * at the same time */
9390     if (cp >= array[mid]) {
9391         if (cp >= array[highest_element]) {
9392             return highest_element;
9393         }
9394
9395         /* Here, array[mid] <= cp < array[highest_element].  This means that
9396          * the final element is not the answer, so can exclude it; it also
9397          * means that <mid> is not the final element, so can refer to 'mid + 1'
9398          * safely */
9399         if (cp < array[mid + 1]) {
9400             return mid;
9401         }
9402         high--;
9403         low = mid + 1;
9404     }
9405     else { /* cp < array[mid] */
9406         if (cp < array[0]) { /* Fail if outside the array */
9407             return -1;
9408         }
9409         high = mid;
9410         if (cp >= array[mid - 1]) {     /* It's the entry just below <mid> */
9411             goto found_entry;
9412         }
9413     }
9414
9415     /* Binary search.  What we are looking for is <i> such that
9416      *  array[i] <= cp < array[i+1]
9417      * The loop below converges on the i+1.  Note that there may not be an
9418      * (i+1)th element in the array, and things work nonetheless */
9419     while (low < high) {
9420         mid = (low + high) / 2;
9421         assert(mid <= highest_element);
9422         if (array[mid] <= cp) { /* cp >= array[mid] */
9423             low = mid + 1;
9424
9425             /* We could do this extra test to exit the loop early.
9426             if (cp < array[low]) {
9427                 return mid;
9428             }
9429             */
9430         }
9431         else { /* cp < array[mid] */
9432             high = mid;
9433         }
9434     }
9435
9436   found_entry:
9437     high--;     /* <high> is one past the entry being sought */
9438     invlist_set_previous_index(invlist, high);
9439     return high;
9440 }
9441
9442 void
9443 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9444                                          const bool complement_b, SV** output)
9445 {
9446     /* Take the union of two inversion lists and point '*output' to it.  On
9447      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9448      * even 'a' or 'b').  If to an inversion list, the contents of the original
9449      * list will be replaced by the union.  The first list, 'a', may be
9450      * NULL, in which case a copy of the second list is placed in '*output'.
9451      * If 'complement_b' is TRUE, the union is taken of the complement
9452      * (inversion) of 'b' instead of b itself.  'b' must differ from 'a'.
9453      *
9454      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9455      * Richard Gillam, published by Addison-Wesley, and explained at some
9456      * length there.  The preface says to incorporate its examples into your
9457      * code at your own risk.
9458      *
9459      * The algorithm is like a merge sort. */
9460
9461     const UV* array_a;    /* a's array */
9462     const UV* array_b;
9463     UV len_a;       /* length of a's array */
9464     UV len_b;
9465
9466     SV* u;                      /* the resulting union */
9467     UV* array_u;
9468     UV len_u = 0;
9469
9470     UV i_a = 0;             /* current index into a's array */
9471     UV i_b = 0;
9472     UV i_u = 0;
9473
9474     /* running count, as explained in the algorithm source book; items are
9475      * stopped accumulating and are output when the count changes to/from 0.
9476      * The count is incremented when we start a range that's in an input's set,
9477      * and decremented when we start a range that's not in a set.  So this
9478      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9479      * and hence nothing goes into the union; 1, just one of the inputs is in
9480      * its set (and its current range gets added to the union); and 2 when both
9481      * inputs are in their sets.  */
9482     UV count = 0;
9483
9484     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9485     assert(a != b);
9486     assert(*output == NULL || is_invlist(*output));
9487
9488     len_b = _invlist_len(b);
9489     if (len_b == 0) {
9490
9491         /* Here, 'b' is empty, hence its complement is all possible code
9492          * points.  So if the union includes the complement of 'b', it includes
9493          * everything, and we need not even look at 'a'.  It's easiest to
9494          * create a new inversion list that matches everything.  */
9495         if (complement_b) {
9496             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9497
9498             if (*output == NULL) { /* If the output didn't exist, just point it
9499                                       at the new list */
9500                 *output = everything;
9501             }
9502             else { /* Otherwise, replace its contents with the new list */
9503                 invlist_replace_list_destroys_src(*output, everything);
9504                 SvREFCNT_dec_NN(everything);
9505             }
9506
9507             return;
9508         }
9509
9510         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9511          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9512          * output will be empty */
9513
9514         if (a == NULL || _invlist_len(a) == 0) {
9515             if (*output == NULL) {
9516                 *output = _new_invlist(0);
9517             }
9518             else {
9519                 invlist_clear(*output);
9520             }
9521             return;
9522         }
9523
9524         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9525          * union.  We can just return a copy of 'a' if '*output' doesn't point
9526          * to an existing list */
9527         if (*output == NULL) {
9528             *output = invlist_clone(a, NULL);
9529             return;
9530         }
9531
9532         /* If the output is to overwrite 'a', we have a no-op, as it's
9533          * already in 'a' */
9534         if (*output == a) {
9535             return;
9536         }
9537
9538         /* Here, '*output' is to be overwritten by 'a' */
9539         u = invlist_clone(a, NULL);
9540         invlist_replace_list_destroys_src(*output, u);
9541         SvREFCNT_dec_NN(u);
9542
9543         return;
9544     }
9545
9546     /* Here 'b' is not empty.  See about 'a' */
9547
9548     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9549
9550         /* Here, 'a' is empty (and b is not).  That means the union will come
9551          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9552          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9553          * the clone */
9554
9555         SV ** dest = (*output == NULL) ? output : &u;
9556         *dest = invlist_clone(b, NULL);
9557         if (complement_b) {
9558             _invlist_invert(*dest);
9559         }
9560
9561         if (dest == &u) {
9562             invlist_replace_list_destroys_src(*output, u);
9563             SvREFCNT_dec_NN(u);
9564         }
9565
9566         return;
9567     }
9568
9569     /* Here both lists exist and are non-empty */
9570     array_a = invlist_array(a);
9571     array_b = invlist_array(b);
9572
9573     /* If we are to take the union of 'a' with the complement of b, set it
9574      * up so we are looking at b's complement. */
9575     if (complement_b) {
9576
9577         /* To complement, we invert: if the first element is 0, remove it.  To
9578          * do this, we just pretend the array starts one later */
9579         if (array_b[0] == 0) {
9580             array_b++;
9581             len_b--;
9582         }
9583         else {
9584
9585             /* But if the first element is not zero, we pretend the list starts
9586              * at the 0 that is always stored immediately before the array. */
9587             array_b--;
9588             len_b++;
9589         }
9590     }
9591
9592     /* Size the union for the worst case: that the sets are completely
9593      * disjoint */
9594     u = _new_invlist(len_a + len_b);
9595
9596     /* Will contain U+0000 if either component does */
9597     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9598                                       || (len_b > 0 && array_b[0] == 0));
9599
9600     /* Go through each input list item by item, stopping when have exhausted
9601      * one of them */
9602     while (i_a < len_a && i_b < len_b) {
9603         UV cp;      /* The element to potentially add to the union's array */
9604         bool cp_in_set;   /* is it in the input list's set or not */
9605
9606         /* We need to take one or the other of the two inputs for the union.
9607          * Since we are merging two sorted lists, we take the smaller of the
9608          * next items.  In case of a tie, we take first the one that is in its
9609          * set.  If we first took the one not in its set, it would decrement
9610          * the count, possibly to 0 which would cause it to be output as ending
9611          * the range, and the next time through we would take the same number,
9612          * and output it again as beginning the next range.  By doing it the
9613          * opposite way, there is no possibility that the count will be
9614          * momentarily decremented to 0, and thus the two adjoining ranges will
9615          * be seamlessly merged.  (In a tie and both are in the set or both not
9616          * in the set, it doesn't matter which we take first.) */
9617         if (       array_a[i_a] < array_b[i_b]
9618             || (   array_a[i_a] == array_b[i_b]
9619                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9620         {
9621             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9622             cp = array_a[i_a++];
9623         }
9624         else {
9625             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9626             cp = array_b[i_b++];
9627         }
9628
9629         /* Here, have chosen which of the two inputs to look at.  Only output
9630          * if the running count changes to/from 0, which marks the
9631          * beginning/end of a range that's in the set */
9632         if (cp_in_set) {
9633             if (count == 0) {
9634                 array_u[i_u++] = cp;
9635             }
9636             count++;
9637         }
9638         else {
9639             count--;
9640             if (count == 0) {
9641                 array_u[i_u++] = cp;
9642             }
9643         }
9644     }
9645
9646
9647     /* The loop above increments the index into exactly one of the input lists
9648      * each iteration, and ends when either index gets to its list end.  That
9649      * means the other index is lower than its end, and so something is
9650      * remaining in that one.  We decrement 'count', as explained below, if
9651      * that list is in its set.  (i_a and i_b each currently index the element
9652      * beyond the one we care about.) */
9653     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9654         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9655     {
9656         count--;
9657     }
9658
9659     /* Above we decremented 'count' if the list that had unexamined elements in
9660      * it was in its set.  This has made it so that 'count' being non-zero
9661      * means there isn't anything left to output; and 'count' equal to 0 means
9662      * that what is left to output is precisely that which is left in the
9663      * non-exhausted input list.
9664      *
9665      * To see why, note first that the exhausted input obviously has nothing
9666      * left to add to the union.  If it was in its set at its end, that means
9667      * the set extends from here to the platform's infinity, and hence so does
9668      * the union and the non-exhausted set is irrelevant.  The exhausted set
9669      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9670      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9671      * 'count' remains at 1.  This is consistent with the decremented 'count'
9672      * != 0 meaning there's nothing left to add to the union.
9673      *
9674      * But if the exhausted input wasn't in its set, it contributed 0 to
9675      * 'count', and the rest of the union will be whatever the other input is.
9676      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9677      * otherwise it gets decremented to 0.  This is consistent with 'count'
9678      * == 0 meaning the remainder of the union is whatever is left in the
9679      * non-exhausted list. */
9680     if (count != 0) {
9681         len_u = i_u;
9682     }
9683     else {
9684         IV copy_count = len_a - i_a;
9685         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9686             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9687         }
9688         else { /* The non-exhausted input is b */
9689             copy_count = len_b - i_b;
9690             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9691         }
9692         len_u = i_u + copy_count;
9693     }
9694
9695     /* Set the result to the final length, which can change the pointer to
9696      * array_u, so re-find it.  (Note that it is unlikely that this will
9697      * change, as we are shrinking the space, not enlarging it) */
9698     if (len_u != _invlist_len(u)) {
9699         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9700         invlist_trim(u);
9701         array_u = invlist_array(u);
9702     }
9703
9704     if (*output == NULL) {  /* Simply return the new inversion list */
9705         *output = u;
9706     }
9707     else {
9708         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9709          * could instead free '*output', and then set it to 'u', but experience
9710          * has shown [perl #127392] that if the input is a mortal, we can get a
9711          * huge build-up of these during regex compilation before they get
9712          * freed. */
9713         invlist_replace_list_destroys_src(*output, u);
9714         SvREFCNT_dec_NN(u);
9715     }
9716
9717     return;
9718 }
9719
9720 void
9721 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9722                                                const bool complement_b, SV** i)
9723 {
9724     /* Take the intersection of two inversion lists and point '*i' to it.  On
9725      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9726      * even 'a' or 'b').  If to an inversion list, the contents of the original
9727      * list will be replaced by the intersection.  The first list, 'a', may be
9728      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9729      * TRUE, the result will be the intersection of 'a' and the complement (or
9730      * inversion) of 'b' instead of 'b' directly.
9731      *
9732      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9733      * Richard Gillam, published by Addison-Wesley, and explained at some
9734      * length there.  The preface says to incorporate its examples into your
9735      * code at your own risk.  In fact, it had bugs
9736      *
9737      * The algorithm is like a merge sort, and is essentially the same as the
9738      * union above
9739      */
9740
9741     const UV* array_a;          /* a's array */
9742     const UV* array_b;
9743     UV len_a;   /* length of a's array */
9744     UV len_b;
9745
9746     SV* r;                   /* the resulting intersection */
9747     UV* array_r;
9748     UV len_r = 0;
9749
9750     UV i_a = 0;             /* current index into a's array */
9751     UV i_b = 0;
9752     UV i_r = 0;
9753
9754     /* running count of how many of the two inputs are postitioned at ranges
9755      * that are in their sets.  As explained in the algorithm source book,
9756      * items are stopped accumulating and are output when the count changes
9757      * to/from 2.  The count is incremented when we start a range that's in an
9758      * input's set, and decremented when we start a range that's not in a set.
9759      * Only when it is 2 are we in the intersection. */
9760     UV count = 0;
9761
9762     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9763     assert(a != b);
9764     assert(*i == NULL || is_invlist(*i));
9765
9766     /* Special case if either one is empty */
9767     len_a = (a == NULL) ? 0 : _invlist_len(a);
9768     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9769         if (len_a != 0 && complement_b) {
9770
9771             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9772              * must be empty.  Here, also we are using 'b's complement, which
9773              * hence must be every possible code point.  Thus the intersection
9774              * is simply 'a'. */
9775
9776             if (*i == a) {  /* No-op */
9777                 return;
9778             }
9779
9780             if (*i == NULL) {
9781                 *i = invlist_clone(a, NULL);
9782                 return;
9783             }
9784
9785             r = invlist_clone(a, NULL);
9786             invlist_replace_list_destroys_src(*i, r);
9787             SvREFCNT_dec_NN(r);
9788             return;
9789         }
9790
9791         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9792          * intersection must be empty */
9793         if (*i == NULL) {
9794             *i = _new_invlist(0);
9795             return;
9796         }
9797
9798         invlist_clear(*i);
9799         return;
9800     }
9801
9802     /* Here both lists exist and are non-empty */
9803     array_a = invlist_array(a);
9804     array_b = invlist_array(b);
9805
9806     /* If are to take the intersection of 'a' with the complement of b, set it
9807      * up so are looking at b's complement. */
9808     if (complement_b) {
9809
9810         /* To complement, we invert: if the first element is 0, remove it.  To
9811          * do this, we just pretend the array starts one later */
9812         if (array_b[0] == 0) {
9813             array_b++;
9814             len_b--;
9815         }
9816         else {
9817
9818             /* But if the first element is not zero, we pretend the list starts
9819              * at the 0 that is always stored immediately before the array. */
9820             array_b--;
9821             len_b++;
9822         }
9823     }
9824
9825     /* Size the intersection for the worst case: that the intersection ends up
9826      * fragmenting everything to be completely disjoint */
9827     r= _new_invlist(len_a + len_b);
9828
9829     /* Will contain U+0000 iff both components do */
9830     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9831                                      && len_b > 0 && array_b[0] == 0);
9832
9833     /* Go through each list item by item, stopping when have exhausted one of
9834      * them */
9835     while (i_a < len_a && i_b < len_b) {
9836         UV cp;      /* The element to potentially add to the intersection's
9837                        array */
9838         bool cp_in_set; /* Is it in the input list's set or not */
9839
9840         /* We need to take one or the other of the two inputs for the
9841          * intersection.  Since we are merging two sorted lists, we take the
9842          * smaller of the next items.  In case of a tie, we take first the one
9843          * that is not in its set (a difference from the union algorithm).  If
9844          * we first took the one in its set, it would increment the count,
9845          * possibly to 2 which would cause it to be output as starting a range
9846          * in the intersection, and the next time through we would take that
9847          * same number, and output it again as ending the set.  By doing the
9848          * opposite of this, there is no possibility that the count will be
9849          * momentarily incremented to 2.  (In a tie and both are in the set or
9850          * both not in the set, it doesn't matter which we take first.) */
9851         if (       array_a[i_a] < array_b[i_b]
9852             || (   array_a[i_a] == array_b[i_b]
9853                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9854         {
9855             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9856             cp = array_a[i_a++];
9857         }
9858         else {
9859             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9860             cp= array_b[i_b++];
9861         }
9862
9863         /* Here, have chosen which of the two inputs to look at.  Only output
9864          * if the running count changes to/from 2, which marks the
9865          * beginning/end of a range that's in the intersection */
9866         if (cp_in_set) {
9867             count++;
9868             if (count == 2) {
9869                 array_r[i_r++] = cp;
9870             }
9871         }
9872         else {
9873             if (count == 2) {
9874                 array_r[i_r++] = cp;
9875             }
9876             count--;
9877         }
9878
9879     }
9880
9881     /* The loop above increments the index into exactly one of the input lists
9882      * each iteration, and ends when either index gets to its list end.  That
9883      * means the other index is lower than its end, and so something is
9884      * remaining in that one.  We increment 'count', as explained below, if the
9885      * exhausted list was in its set.  (i_a and i_b each currently index the
9886      * element beyond the one we care about.) */
9887     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9888         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9889     {
9890         count++;
9891     }
9892
9893     /* Above we incremented 'count' if the exhausted list was in its set.  This
9894      * has made it so that 'count' being below 2 means there is nothing left to
9895      * output; otherwise what's left to add to the intersection is precisely
9896      * that which is left in the non-exhausted input list.
9897      *
9898      * To see why, note first that the exhausted input obviously has nothing
9899      * left to affect the intersection.  If it was in its set at its end, that
9900      * means the set extends from here to the platform's infinity, and hence
9901      * anything in the non-exhausted's list will be in the intersection, and
9902      * anything not in it won't be.  Hence, the rest of the intersection is
9903      * precisely what's in the non-exhausted list.  The exhausted set also
9904      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9905      * it means 'count' is now at least 2.  This is consistent with the
9906      * incremented 'count' being >= 2 means to add the non-exhausted list to
9907      * the intersection.
9908      *
9909      * But if the exhausted input wasn't in its set, it contributed 0 to
9910      * 'count', and the intersection can't include anything further; the
9911      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9912      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9913      * further to add to the intersection. */
9914     if (count < 2) { /* Nothing left to put in the intersection. */
9915         len_r = i_r;
9916     }
9917     else { /* copy the non-exhausted list, unchanged. */
9918         IV copy_count = len_a - i_a;
9919         if (copy_count > 0) {   /* a is the one with stuff left */
9920             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9921         }
9922         else {  /* b is the one with stuff left */
9923             copy_count = len_b - i_b;
9924             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9925         }
9926         len_r = i_r + copy_count;
9927     }
9928
9929     /* Set the result to the final length, which can change the pointer to
9930      * array_r, so re-find it.  (Note that it is unlikely that this will
9931      * change, as we are shrinking the space, not enlarging it) */
9932     if (len_r != _invlist_len(r)) {
9933         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9934         invlist_trim(r);
9935         array_r = invlist_array(r);
9936     }
9937
9938     if (*i == NULL) { /* Simply return the calculated intersection */
9939         *i = r;
9940     }
9941     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9942               instead free '*i', and then set it to 'r', but experience has
9943               shown [perl #127392] that if the input is a mortal, we can get a
9944               huge build-up of these during regex compilation before they get
9945               freed. */
9946         if (len_r) {
9947             invlist_replace_list_destroys_src(*i, r);
9948         }
9949         else {
9950             invlist_clear(*i);
9951         }
9952         SvREFCNT_dec_NN(r);
9953     }
9954
9955     return;
9956 }
9957
9958 SV*
9959 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9960 {
9961     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9962      * set.  A pointer to the inversion list is returned.  This may actually be
9963      * a new list, in which case the passed in one has been destroyed.  The
9964      * passed-in inversion list can be NULL, in which case a new one is created
9965      * with just the one range in it.  The new list is not necessarily
9966      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9967      * result of this function.  The gain would not be large, and in many
9968      * cases, this is called multiple times on a single inversion list, so
9969      * anything freed may almost immediately be needed again.
9970      *
9971      * This used to mostly call the 'union' routine, but that is much more
9972      * heavyweight than really needed for a single range addition */
9973
9974     UV* array;              /* The array implementing the inversion list */
9975     UV len;                 /* How many elements in 'array' */
9976     SSize_t i_s;            /* index into the invlist array where 'start'
9977                                should go */
9978     SSize_t i_e = 0;        /* And the index where 'end' should go */
9979     UV cur_highest;         /* The highest code point in the inversion list
9980                                upon entry to this function */
9981
9982     /* This range becomes the whole inversion list if none already existed */
9983     if (invlist == NULL) {
9984         invlist = _new_invlist(2);
9985         _append_range_to_invlist(invlist, start, end);
9986         return invlist;
9987     }
9988
9989     /* Likewise, if the inversion list is currently empty */
9990     len = _invlist_len(invlist);
9991     if (len == 0) {
9992         _append_range_to_invlist(invlist, start, end);
9993         return invlist;
9994     }
9995
9996     /* Starting here, we have to know the internals of the list */
9997     array = invlist_array(invlist);
9998
9999     /* If the new range ends higher than the current highest ... */
10000     cur_highest = invlist_highest(invlist);
10001     if (end > cur_highest) {
10002
10003         /* If the whole range is higher, we can just append it */
10004         if (start > cur_highest) {
10005             _append_range_to_invlist(invlist, start, end);
10006             return invlist;
10007         }
10008
10009         /* Otherwise, add the portion that is higher ... */
10010         _append_range_to_invlist(invlist, cur_highest + 1, end);
10011         /* NOTE(review): 'array'/'len' predate this append; confirm still valid */
10012         /* ... and continue on below to handle the rest.  As a result of the
10013          * above append, we know that the index of the end of the range is the
10014          * final even numbered one of the array.  Recall that the final element
10015          * always starts a range that extends to infinity.  If that range is in
10016          * the set (meaning the set goes from here to infinity), it will be an
10017          * even index, but if it isn't in the set, it's odd, and the final
10018          * range in the set is one less, which is even. */
10019         if (end == UV_MAX) {
10020             i_e = len;
10021         }
10022         else {
10023             i_e = len - 2;
10024         }
10025     }
10026
10027     /* We have dealt with appending, now see about prepending.  If the new
10028      * range starts lower than the current lowest ... */
10029     if (start < array[0]) {
10030
10031         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10032          * Let the union code handle it, rather than having to know the
10033          * trickiness in two code places.  */
10034         if (UNLIKELY(start == 0)) {
10035             SV* range_invlist;
10036
10037             range_invlist = _new_invlist(2);
10038             _append_range_to_invlist(range_invlist, start, end);
10039
10040             _invlist_union(invlist, range_invlist, &invlist);
10041
10042             SvREFCNT_dec_NN(range_invlist);
10043
10044             return invlist;
10045         }
10046
10047         /* If the whole new range comes before the first entry, and doesn't
10048          * extend it, we have to insert it as an additional range */
10049         if (end < array[0] - 1) {
10050             i_s = i_e = -1;     /* so the splice lands ahead of element 0 */
10051             goto splice_in_new_range;
10052         }
10053
10054         /* Here the new range adjoins the existing first range, extending it
10055          * downwards. */
10056         array[0] = start;
10057
10058         /* And continue on below to handle the rest.  We know that the index of
10059          * the beginning of the range is the first one of the array */
10060         i_s = 0;
10061     }
10062     else { /* Not prepending any part of the new range to the existing list.
10063             * Find where in the list it should go.  This finds i_s, such that:
10064             *     array[i_s] <= start < array[i_s+1]
10065             */
10066         i_s = _invlist_search(invlist, start);
10067     }
10068
10069     /* At this point, any extending before the beginning of the inversion list
10070      * and/or after the end has been done.  This has made it so that, in the
10071      * code below, each endpoint of the new range is either in a range that is
10072      * in the set, or is in a gap between two ranges that are.  This means we
10073      * don't have to worry about exceeding the array bounds.
10074      *
10075      * Find where in the list the new range ends.  'i_e' being 0, its initial
10076      * value, is taken to mean it hasn't been determined yet.  The search is
10077      * skipped when the answer must be the same as i_s, already computed */
10078     if (i_e == 0) {
10079         i_e = (start == end)
10080               ? i_s
10081               : _invlist_search(invlist, end);
10082     }
10083
10084     /* Here generally array[i_e] <= end < array[i_e+1].  But if array[i_e]
10085      * is a range that goes to infinity there is no element at array[i_e+1],
10086      * so only the first relation holds. */
10087
10088     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10089
10090         /* Here, the ranges on either side of the beginning of the new range
10091          * are in the set, and this range starts in the gap between them.
10092          *
10093          * The new range extends the range above it downwards if the new range
10094          * ends at or above that range's start */
10095         const bool extends_the_range_above = (   end == UV_MAX
10096                                               || end + 1 >= array[i_s+1]);
10097
10098         /* The new range extends the range below it upwards if it begins just
10099          * after where that range ends */
10100         if (start == array[i_s]) {
10101
10102             /* If the new range fills the entire gap between the other ranges,
10103              * they will get merged together.  Other ranges may also get
10104              * merged, depending on how many of them the new range spans.  In
10105              * the general case, we do the merge later, just once, after we
10106              * figure out how many to merge.  But in the case where the new
10107              * range exactly spans just this one gap (possibly extending into
10108              * the one above), we do the merge here, and an early exit.  This
10109              * is done here to avoid having to special case later. */
10110             if (i_e - i_s <= 1) {
10111
10112                 /* If i_e - i_s == 1, it means that the new range terminates
10113                  * within the range above, and hence 'extends_the_range_above'
10114                  * must be true.  (If the range above it extends to infinity,
10115                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10116                  * will be 0, so no harm done.) */
10117                 if (extends_the_range_above) {
10118                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10119                     invlist_set_len(invlist,
10120                                     len - 2,
10121                                     *(get_invlist_offset_addr(invlist)));
10122                     return invlist;
10123                 }
10124
10125                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10126                  * to the same range, and below we are about to decrement i_s
10127                  * */
10128                 i_e--;
10129             }
10130
10131             /* Here, the new range is adjacent to the one below.  (It may also
10132              * span beyond the range above, but that will get resolved later.)
10133              * Extend the range below to include this one. */
10134             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10135             i_s--;
10136             start = array[i_s];
10137         }
10138         else if (extends_the_range_above) {
10139
10140             /* Here the new range only extends the range above it, but not the
10141              * one below.  It merges with the one above.  Again, we keep i_e
10142              * and i_s in sync if they point to the same range */
10143             if (i_e == i_s) {
10144                 i_e++;
10145             }
10146             i_s++;
10147             array[i_s] = start;
10148         }
10149     }
10150
10151     /* Here, we've dealt with the new range start extending any adjoining
10152      * existing ranges.
10153      *
10154      * If the new range extends to infinity, it is now the final one,
10155      * regardless of what was there before */
10156     if (UNLIKELY(end == UV_MAX)) {
10157         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10158         return invlist;
10159     }
10160
10161     /* If i_e started as == i_s, it has also been dealt with,
10162      * and been updated to the new i_s, which will fail the following if */
10163     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10164
10165         /* Here, the ranges on either side of the end of the new range are in
10166          * the set, and this range ends in the gap between them.
10167          *
10168          * If this range is adjacent to (hence extends) the range above it, it
10169          * becomes part of that range; likewise if it extends the range below,
10170          * it becomes part of that range */
10171         if (end + 1 == array[i_e+1]) {
10172             i_e++;
10173             array[i_e] = start;
10174         }
10175         else if (start <= array[i_e]) {
10176             array[i_e] = end + 1;
10177             i_e--;
10178         }
10179     }
10180
10181     if (i_s == i_e) {
10182
10183         /* If the range fits entirely in an existing range (as possibly already
10184          * extended above), it doesn't add anything new */
10185         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10186             return invlist;
10187         }
10188
10189         /* Here, no part of the range is in the list.  Must add it.  It will
10190          * occupy 2 more slots */
10191       splice_in_new_range:
10192
10193         invlist_extend(invlist, len + 2);
10194         array = invlist_array(invlist);
10195         /* Move the rest of the array up two slots, to open a gap just after
10196          * index i_e.  Don't include any trailing NUL */
10197         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10198
10199         /* Do the actual splice */
10200         array[i_e+1] = start;
10201         array[i_e+2] = end + 1;
10202         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10203         return invlist;
10204     }
10205
10206     /* Here the new range crossed the boundaries of a pre-existing range.  The
10207      * code above has adjusted things so that both ends are in ranges that are
10208      * in the set.  This means everything in between must also be in the set.
10209      * Just squash things together */
10210     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10211     invlist_set_len(invlist,
10212                     len - i_e + i_s,
10213                     *(get_invlist_offset_addr(invlist)));
10214
10215     return invlist;
10216 }
10217
10218 SV*
10219 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10220                                  UV** other_elements_ptr)
10221 {
10222     /* Build and return an inversion list sized for 'size' elements, whose
10223      * contents the caller is going to supply.  Only the very first element,
10224      * 'element0', is stored here; it has to be passed in because the
10225      * underlying code does things differently depending on whether it is zero
10226      * or non-zero.  On return, '*other_elements_ptr' addresses the slot just
10227      * after the first one in the list's array, which is where the caller is
10228      * to place the remaining elements.
10229      *
10230      * The list is returned already claiming a length of 'size', so the caller
10231      * is trusted to really fill in all the other elements properly. */
10232
10233     SV* canned = _new_invlist(size);
10234
10235     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10236
10237     /* Store element0 through the ordinary add routine */
10238     canned = add_cp_to_invlist(canned, element0);
10239
10240     /* Claim the full length, keeping the offset as it stands after the add */
10241     invlist_set_len(canned, size, *get_invlist_offset_addr(canned));
10242
10243     *other_elements_ptr = invlist_array(canned) + 1;
10244     return canned;
10245 }
10246
10247 #endif
10248
10249 #ifndef PERL_IN_XSUB_RE
10250 void
10251 Perl__invlist_invert(pTHX_ SV* const invlist)
10252 {
10253     /* Turn 'invlist' into its own complement, in place.  An empty list
10254      * becomes one matching every code point; for any other list just the
10255      * offset flag is toggled, which is why this is so cheap */
10256
10257     PERL_ARGS_ASSERT__INVLIST_INVERT;
10258
10259     assert(! invlist_is_iterating(invlist));
10260
10261     if (_invlist_len(invlist) != 0) {
10262         bool * const offset_flag = get_invlist_offset_addr(invlist);
10263         *offset_flag = ! *offset_flag;
10264     }
10265     else {  /* The inverse of matching nothing is matching everything */
10266         _append_range_to_invlist(invlist, 0, UV_MAX);
10267     }
10268 }
10269
10270 SV*
10271 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10272 {
10273     /* Copy 'invlist', which is left unchanged, and return the copy.  If
10274      * 'new_invlist' is NULL, a new list is created to hold the copy (it will
10275      * not be mortal even if the old one was); otherwise 'new_invlist' itself
10276      * is upgraded to an inversion list, filled in, and returned. */
10277     const STRLEN n_elements = _invlist_len(invlist);
10278     const STRLEN n_bytes = SvCUR(invlist);
10279     const bool src_offset = *(get_invlist_offset_addr(invlist));
10280
10281     PERL_ARGS_ASSERT_INVLIST_CLONE;
10282
10283     if (new_invlist != NULL) {  /* Caller supplied the destination SV */
10284         sv_upgrade(new_invlist, SVt_INVLIST);
10285         initialize_invlist_guts(new_invlist, n_elements);
10286     }
10287     else {
10288         new_invlist = _new_invlist(n_elements);
10289     }
10290
10291     *(get_invlist_offset_addr(new_invlist)) = src_offset;
10292     invlist_set_len(new_invlist, n_elements, src_offset);
10293     Copy(SvPVX(invlist), SvPVX(new_invlist), n_bytes, char);
10294     return new_invlist;
10295 }
10296
10297 #endif
10298
10299 PERL_STATIC_INLINE UV
10300 S_invlist_lowest(SV* const invlist)
10301 {
10302     /* Return the smallest code point that 'invlist' matches.  An empty list
10303      * yields 0, which can't be told apart from a list whose lowest member
10304      * really is 0; a caller for whom that matters must check for emptiness
10305      * itself before calling this. */
10306
10307     const UV n_elements = _invlist_len(invlist);
10308
10309     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10310
10311     /* The array is looked at only when the list is known to be non-empty */
10312     if (n_elements > 0) {
10313
10314         /* The answer is simply the array's first element */
10315         return invlist_array(invlist)[0];
10316     }
10317
10318     return 0;
10319 }
10320
10321 STATIC SV *
10322 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10323 {
10324     /* Render the ranges of 'invlist' as text in a newly created SV, which is
10325      * returned.  Two layouts are available:
10326      *   'traditional_style' TRUE:  the layout traditionally used for debug
10327      *      tracing.  The string starts with a newline, every range is
10328      *      followed by a newline, and a tab separates a range's two ends.
10329      *   'traditional_style' FALSE: a layout suitable for just copying to the
10330      *      output.  A blank goes between ranges (with none after the last
10331      *      one), and a dash separates a range's two ends.
10332      * A range that runs to UV_MAX has "INFTY" as its upper end; a range of
10333      * just one code point is shown as that single value. */
10334
10335     UV lo, hi;
10336     const char within_range_sep = (traditional_style) ? '\t' : '-';
10337     const char after_range_sep = (traditional_style) ? '\n' : ' ';
10338     SV* const output = (traditional_style) ? newSVpvs("\n") : newSVpvs("");
10339
10340     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10341
10342     assert(! invlist_is_iterating(invlist));
10343
10344     invlist_iterinit(invlist);
10345     while (invlist_iternext(invlist, &lo, &hi)) {
10346
10347         /* The UV_MAX test has to come first, so that a range ending there is
10348          * shown as open-ended even if it is also where the range starts */
10349         if (hi == UV_MAX) {
10350             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10351                                  lo, within_range_sep, after_range_sep);
10352         }
10353         else if (hi == lo) {    /* A lone code point */
10354             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10355                                  lo, after_range_sep);
10356         }
10357         else {
10358             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10359                                  lo, within_range_sep, hi, after_range_sep);
10360         }
10361     }
10362
10363     /* Outside the traditional layout, whatever was output ends with a
10364      * separator blank that has nothing after it; chop it off */
10365     if (! traditional_style && SvCUR(output) > 0) {
10366         SvCUR_set(output, SvCUR(output) - 1);
10367     }
10368
10369     return output;
10370 }
10371
10372 #ifndef PERL_IN_XSUB_RE
10373 void
10374 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10375                          const char * const indent, SV* const invlist)
10376 {
10377     /* Designed to be called only by do_sv_dump().  Writes the ranges of the
10378      * inversion list 'invlist' to 'file' at 'level', one range per line, each
10379      * line prefixed by the string 'indent'.  The output looks like this:
10380          [0] 0x000A .. 0x000D
10381          [2] 0x0085
10382          [4] 0x2028 .. 0x2029
10383          [6] 0x3104 .. INFTY
10384      * That is, the first range matched by the list is 0xA through 0xD; the
10385      * second is the single code point 0x85, and so on.  The bracketed number
10386      * goes up by two per line, as a range takes two elements of the
10387      * underlying UV array (though a final range extending to infinity needs
10388      * just one).  If the list is in the middle of being iterated over, a
10389      * message saying so is output in place of the contents. */
10390
10391     UV lo, hi;
10392     STRLEN elt_index;
10393
10394     PERL_ARGS_ASSERT__INVLIST_DUMP;
10395
10396     if (invlist_is_iterating(invlist)) {
10397         Perl_dump_indent(aTHX_ level, file,
10398              "%sCan't dump inversion list because is in middle of iterating\n",
10399              indent);
10400         return;
10401     }
10402
10403     invlist_iterinit(invlist);
10404     for (elt_index = 0; invlist_iternext(invlist, &lo, &hi); elt_index += 2) {
10405         if (hi == UV_MAX) {     /* Range running up to UV_MAX */
10406             Perl_dump_indent(aTHX_ level, file,
10407                              "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10408                              indent, (UV) elt_index, lo);
10409         }
10410         else if (hi == lo) {    /* Just one code point */
10411             Perl_dump_indent(aTHX_ level, file,
10412                              "%s[%" UVuf "] 0x%04" UVXf "\n",
10413                              indent, (UV) elt_index, lo);
10414         }
10415         else {
10416             Perl_dump_indent(aTHX_ level, file,
10417                              "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10418                              indent, (UV) elt_index, lo, hi);
10419         }
10420     }
10421 }
10422
10423 #endif
10424
10425 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10426 bool
10427 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10428 {
10429     /* Return TRUE if inversion lists 'a' and 'b' match exactly the same set
10430      * of code points; FALSE otherwise.  If 'complement_b' is TRUE, 'a' is
10431      * compared against the complement of 'b' instead.  Neither is changed. */
10432
10433     const UV len_a = _invlist_len(a);
10434     UV len_b = _invlist_len(b);
10435
10436     const UV* array_a = NULL;
10437     const UV* array_b = NULL;
10438
10439     PERL_ARGS_ASSERT__INVLISTEQ;
10440
10441     /* This code avoids accessing the arrays unless it knows the length is
10442      * non-zero; an array pointer stays NULL when its list is empty */
10443
10444     if (len_a == 0) {
10445         if (len_b == 0) {
10446             return ! complement_b;
10447         }
10448     }
10449     else {
10450         array_a = invlist_array(a);
10451     }
10452
10453     if (len_b != 0) {
10454         array_b = invlist_array(b);
10455     }
10456
10457     /* If we are to compare 'a' with the complement of b, set it
10458      * up so that we are looking at b's complement. */
10459     if (complement_b) {
10460
10461         /* The complement of nothing is everything, so <a> would have to have
10462          * just one element, starting at zero (ending at infinity) */
10463         if (len_b == 0) {
10464             return (len_a == 1 && array_a[0] == 0);
10465         }
10466         if (array_b[0] == 0) {
10467
10468             /* Otherwise, to complement, we invert.  Here, the first element is
10469              * 0, just remove it.  To do this, we just pretend the array starts
10470              * one later */
10471
10472             array_b++;
10473             len_b--;
10474         }
10475         else {
10476
10477             /* But if the first element is not zero, we pretend the list starts
10478              * at the 0 that is always stored immediately before the array. */
10479             array_b--;
10480             len_b++;
10481         }
10482     }
10483     /* If both lengths are 0, array_a is NULL; don't hand that to memEQ */
10484     return    len_a == len_b
10485            && (   len_a == 0
10486                || memEQ(array_a, array_b, len_a * sizeof(array_a[0])));
10487 }
10488 #endif
10489
10490 /*
10491  * As best we can, determine the characters that can match the start of
10492  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10493  * can be false positive matches
10494  *
10495  * Returns the invlist as a new SV*; it is the caller's responsibility to
10496  * call SvREFCNT_dec() when done with it.
10497  */
10498 STATIC SV*
10499 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10500 {
10501     dVAR;
10502     const U8 * s = (U8*)STRING(node);   /* Start of the node's string */
10503     SSize_t bytelen = STR_LEN(node);    /* Its length in bytes */
10504     UV uc;                              /* First character of the node */
10505     /* Start out big enough for 2 separate code points */
10506     SV* invlist = _new_invlist(4);
10507     /* Only the node's first character (and its folds) is examined below */
10508     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10509
10510     if (! UTF) {    /* Pattern is not UTF-8 */
10511         uc = *s;
10512
10513         /* We punt and assume it can match anything if the node begins
10514          * with a multi-character fold.  Things are complicated.  For
10515          * example, /ffi/i could match any of:
10516          *  "\N{LATIN SMALL LIGATURE FFI}"
10517          *  "\N{LATIN SMALL LIGATURE FF}I"
10518          *  "F\N{LATIN SMALL LIGATURE FI}"
10519          *  plus several other things; and making sure we have all the
10520          *  possibilities is hard. */
10521         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10522             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10523         }
10524         else {
10525             /* Any Latin1 range character can potentially match any
10526              * other depending on the locale, and in Turkic locales, U+130 and
10527              * U+131 */
10528             if (OP(node) == EXACTFL) {
10529                 _invlist_union(invlist, PL_Latin1, &invlist);
10530                 invlist = add_cp_to_invlist(invlist,
10531                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10532                 invlist = add_cp_to_invlist(invlist,
10533                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10534             }
10535             else {
10536                 /* But otherwise, it matches at least itself.  We can
10537                  * quickly tell if it has a distinct fold, and if so,
10538                  * it matches that as well */
10539                 invlist = add_cp_to_invlist(invlist, uc);
10540                 if (IS_IN_SOME_FOLD_L1(uc))
10541                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10542             }
10543
10544             /* Some characters match above-Latin1 ones under /i.  This
10545              * is true of EXACTFL ones when the locale is UTF-8 */
10546             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10547                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10548                                     && OP(node) != EXACTFAA_NO_TRIE)))
10549             {
10550                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10551             }
10552         }
10553     }
10554     else {  /* Pattern is UTF-8 */
10555         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10556         const U8* e = s + bytelen;
10557         IV fc;  /* Fold of the first character; -1 while not yet computed */
10558
10559         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10560
10561         /* The only code points that aren't folded in a UTF EXACTFish
10562          * node are the problematic ones in EXACTFL nodes */
10563         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10564             /* We need to check for the possibility that this EXACTFL
10565              * node begins with a multi-char fold.  Therefore we fold
10566              * the first few characters of it so that we can make that
10567              * check */
10568             U8 *d = folded;
10569             int i;
10570             /* Fold up to UTF8_MAX_FOLD_CHAR_EXPAND leading characters into 'folded' */
10571             fc = -1;
10572             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10573                 if (isASCII(*s)) {
10574                     *(d++) = (U8) toFOLD(*s);
10575                     if (fc < 0) {       /* Save the first fold */
10576                         fc = *(d-1);
10577                     }
10578                     s++;
10579                 }
10580                 else {
10581                     STRLEN len;
10582                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10583                     if (fc < 0) {       /* Save the first fold */
10584                         fc = fold;
10585                     }
10586                     d += len;
10587                     s += UTF8SKIP(s);
10588                 }
10589             }
10590
10591             /* And set things up so that the code below looks in this
10592              * folded buffer instead of the node's string */
10593             e = d;
10594             s = folded;
10595         }
10596
10597         /* When we reach here 's' points to the fold of the first
10598          * character(s) of the node; and 'e' points to far enough along
10599          * the folded string to be just past any possible multi-char
10600          * fold.
10601          *
10602          * Unlike the non-UTF-8 case, the macro for determining if a
10603          * string is a multi-char fold requires all the characters to
10604          * already be folded.  This is because of all the complications
10605          * if not.  Note that they are folded anyway, except in EXACTFL
10606          * nodes.  Like the non-UTF case above, we punt if the node
10607          * begins with a multi-char fold  */
10608
10609         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10610             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10611         }
10612         else {  /* Single char fold */
10613             unsigned int k;
10614             unsigned int first_fold;
10615             const unsigned int * remaining_folds;
10616             Size_t folds_count;     /* Count returned by _inverse_folds() */
10617
10618             /* It matches itself */
10619             invlist = add_cp_to_invlist(invlist, fc);
10620
10621             /* ... plus all the things that fold to it, which are found in
10622              * PL_utf8_foldclosures */
10623             folds_count = _inverse_folds(fc, &first_fold,
10624                                                 &remaining_folds);
10625             for (k = 0; k < folds_count; k++) {
10626                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10627                 /* Entry 0 is 'first_fold'; entry k > 0 is remaining_folds[k-1] */
10628                 /* /aa doesn't allow folds between ASCII and non- */
10629                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10630                     && isASCII(c) != isASCII(fc))
10631                 {
10632                     continue;
10633                 }
10634
10635                 invlist = add_cp_to_invlist(invlist, c);
10636             }
10637
10638             if (OP(node) == EXACTFL) {
10639
10640                 /* If either [iI] are present in an EXACTFL node the above code
10641                  * should have added its normal case pair, but under a Turkish
10642                  * locale they could match instead the case pairs from it.  Add
10643                  * those as potential matches as well */
10644                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10645                     invlist = add_cp_to_invlist(invlist,
10646                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10647                     invlist = add_cp_to_invlist(invlist,
10648                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10649                 }
10650                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10651                     invlist = add_cp_to_invlist(invlist, 'I');
10652                 }
10653                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10654                     invlist = add_cp_to_invlist(invlist, 'i');
10655                 }
10656             }
10657         }
10658     }
10659     /* Per the header comment, the caller must SvREFCNT_dec() the result */
10660     return invlist;
10661 }
10662
10663 #undef HEADER_LENGTH          /* These four macros are not defined past here */
10664 #undef TO_INTERNAL_SIZE
10665 #undef FROM_INTERNAL_SIZE
10666 #undef INVLIST_VERSION_ID
10667
10668 /* End of inversion list object */
10669
10670 STATIC void
10671 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10672 {
10673     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10674      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10675      * should point to the first flag; it is updated on output to point to the
10676      * final ')' or ':'.  There needs to be at least one flag, or this will
10677      * abort */
10678
10679     /* for (?g), (?gc), and (?o) warnings; warning
10680        about (?c) will warn about (?g) -- japhy    */
10681
10682 #define WASTED_O  0x01
10683 #define WASTED_G  0x02
10684 #define WASTED_C  0x04
10685 #define WASTED_GC (WASTED_G|WASTED_C)
10686     I32 wastedflags = 0x00;     /* WASTED_* bits already warned about */
10687     U32 posflags = 0, negflags = 0;     /* Flags to turn on / turn off */
10688     U32 *flagsp = &posflags;    /* Switched to &negflags once '-' is seen */
10689     char has_charset_modifier = '\0';   /* Last charset modifier seen, if any */
10690     regex_charset cs;           /* Charset stored into RExC_flags at the end */
10691     bool has_use_defaults = FALSE;      /* TRUE if the flags began with '^' */
10692     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10693     int x_mod_count = 0;    /* Handed to CASE_STD_PMMOD_FLAGS_PARSE_SET; presumably counts 'x' flags */
10694
10695     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10696
10697     /* '^' as an initial flag sets certain defaults */
10698     if (UCHARAT(RExC_parse) == '^') {
10699         RExC_parse++;
10700         has_use_defaults = TRUE;
10701         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10702         cs = (RExC_uni_semantics)
10703              ? REGEX_UNICODE_CHARSET
10704              : REGEX_DEPENDS_CHARSET;
10705         set_regex_charset(&RExC_flags, cs);
10706     }
10707     else {
10708         cs = get_regex_charset(RExC_flags);
10709         if (   cs == REGEX_DEPENDS_CHARSET
10710             && RExC_uni_semantics)
10711         {
10712             cs = REGEX_UNICODE_CHARSET;
10713         }
10714     }
10715     /* Handle one flag character per iteration until ':' or ')' returns */
10716     while (RExC_parse < RExC_end) {
10717         /* && strchr("iogcmsx", *RExC_parse) */
10718         /* (?g), (?gc) and (?o) are useless here
10719            and must be globally applied -- japhy */
10720         switch (*RExC_parse) {
10721
10722             /* Code for the imsxn flags */
10723             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10724
10725             case LOCALE_PAT_MOD:
10726                 if (has_charset_modifier) {
10727                     goto excess_modifier;
10728                 }
10729                 else if (flagsp == &negflags) {
10730                     goto neg_modifier;
10731                 }
10732                 cs = REGEX_LOCALE_CHARSET;
10733                 has_charset_modifier = LOCALE_PAT_MOD;
10734                 break;
10735             case UNICODE_PAT_MOD:
10736                 if (has_charset_modifier) {
10737                     goto excess_modifier;
10738                 }
10739                 else if (flagsp == &negflags) {
10740                     goto neg_modifier;
10741                 }
10742                 cs = REGEX_UNICODE_CHARSET;
10743                 has_charset_modifier = UNICODE_PAT_MOD;
10744                 break;
10745             case ASCII_RESTRICT_PAT_MOD:
10746                 if (flagsp == &negflags) {
10747                     goto neg_modifier;
10748                 }
10749                 if (has_charset_modifier) {
10750                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10751                         goto excess_modifier;
10752                     }
10753                     /* Doubled modifier implies more restricted */
10754                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10755                 }
10756                 else {
10757                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10758                 }
10759                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10760                 break;
10761             case DEPENDS_PAT_MOD:
10762                 if (has_use_defaults) {
10763                     goto fail_modifiers;
10764                 }
10765                 else if (flagsp == &negflags) {
10766                     goto neg_modifier;
10767                 }
10768                 else if (has_charset_modifier) {
10769                     goto excess_modifier;
10770                 }
10771
10772                 /* The dual charset means unicode semantics if the
10773                  * pattern (or target, not known until runtime) are
10774                  * utf8, or something in the pattern indicates unicode
10775                  * semantics */
10776                 cs = (RExC_uni_semantics)
10777                      ? REGEX_UNICODE_CHARSET
10778                      : REGEX_DEPENDS_CHARSET;
10779                 has_charset_modifier = DEPENDS_PAT_MOD;
10780                 break;
10781               excess_modifier:  /* Repeated or conflicting charset modifiers */
10782                 RExC_parse++;
10783                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10784                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10785                 }
10786                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10787                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10788                                         *(RExC_parse - 1));
10789                 }
10790                 else {
10791                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10792                 }
10793                 NOT_REACHED; /*NOTREACHED*/
10794               neg_modifier:     /* A charset modifier was found after the '-' */
10795                 RExC_parse++;
10796                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10797                                     *(RExC_parse - 1));
10798                 NOT_REACHED; /*NOTREACHED*/
10799             case ONCE_PAT_MOD: /* 'o' */
10800             case GLOBAL_PAT_MOD: /* 'g' */
10801                 if (ckWARN(WARN_REGEXP)) {
10802                     const I32 wflagbit = *RExC_parse == 'o'
10803                                          ? WASTED_O
10804                                          : WASTED_G;
10805                     if (! (wastedflags & wflagbit) ) {
10806                         wastedflags |= wflagbit;
10807                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10808                         vWARN5(
10809                             RExC_parse + 1,
10810                             "Useless (%s%c) - %suse /%c modifier",
10811                             flagsp == &negflags ? "?-" : "?",
10812                             *RExC_parse,
10813                             flagsp == &negflags ? "don't " : "",
10814                             *RExC_parse
10815                         );
10816                     }
10817                 }
10818                 break;
10819
10820             case CONTINUE_PAT_MOD: /* 'c' */
10821                 if (ckWARN(WARN_REGEXP)) {
10822                     if (! (wastedflags & WASTED_C) ) {
10823                         wastedflags |= WASTED_GC;   /* Also marks 'g' as already warned about */
10824                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10825                         vWARN3(
10826                             RExC_parse + 1,
10827                             "Useless (%sc) - %suse /gc modifier",
10828                             flagsp == &negflags ? "?-" : "?",
10829                             flagsp == &negflags ? "don't " : ""
10830                         );
10831                     }
10832                 }
10833                 break;
10834             case KEEPCOPY_PAT_MOD: /* 'p' */
10835                 if (flagsp == &negflags) {
10836                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10837                 } else {
10838                     *flagsp |= RXf_PMf_KEEPCOPY;
10839                 }
10840                 break;
10841             case '-':
10842                 /* A flag is a default iff it is following a minus, so
10843                  * if there is a minus, it means we would be trying to
10844                  * re-specify a default, which is an error */
10845                 if (has_use_defaults || flagsp == &negflags) {
10846                     goto fail_modifiers;
10847                 }
10848                 flagsp = &negflags;
10849                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10850                 x_mod_count = 0;
10851                 break;
10852             case ':':
10853             case ')':
10854                 /* Turning on RXf_PMf_EXTENDED without RXf_PMf_EXTENDED_MORE turns the latter off */
10855                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10856                     negflags |= RXf_PMf_EXTENDED_MORE;
10857                 }
10858                 RExC_flags |= posflags;
10859                 /* Turning off RXf_PMf_EXTENDED also turns off RXf_PMf_EXTENDED_MORE */
10860                 if (negflags & RXf_PMf_EXTENDED) {
10861                     negflags |= RXf_PMf_EXTENDED_MORE;
10862                 }
10863                 RExC_flags &= ~negflags;
10864                 set_regex_charset(&RExC_flags, cs);
10865
10866                 return;
10867             default:
10868               fail_modifiers:
10869                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
10870                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10871                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10872                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10873                 NOT_REACHED; /*NOTREACHED*/
10874         }
10875         /* Advance to the next flag character */
10876         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10877     }
10878     /* Reached RExC_end without seeing a terminating ':' or ')' */
10879     vFAIL("Sequence (?... not terminated");
10880 }
10881
10882 /*
10883  - reg - regular expression, i.e. main body or parenthesized thing
10884  *
10885  * Caller must absorb opening parenthesis.
10886  *
10887  * Combining parenthesis handling with the base level of regular expression
10888  * is a trifle forced, but the need to tie the tails of the branches to what
10889  * follows makes it hard to avoid.
10890  */
10891 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) /* needs a 'depth' variable in scope */
10892 #ifdef DEBUGGING
10893 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10894 #else
10895 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) /* same as REGTAIL without DEBUGGING */
10896 #endif
10897
10898 PERL_STATIC_INLINE regnode_offset
10899 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10900                              I32 *flagp,
10901                              char * parse_start,
10902                              char ch
10903                       )
10904 {
10905     regnode_offset ret;
10906     char* name_start = RExC_parse;  /* Where the name begins in the pattern */
10907     U32 num = 0;    /* Index of the data slot holding sv_dat; stays 0 if none */
10908     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10909     GET_RE_DEBUG_FLAGS_DECL;
10910     /* Creates the REFN-family node for a named backreference that must end with 'ch' */
10911     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10912     /* Fail if RExC_parse has not moved past name_start, or isn't at 'ch' */
10913     if (RExC_parse == name_start || *RExC_parse != ch) {
10914         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10915         vFAIL2("Sequence %.3s... not terminated", parse_start);
10916     }
10917     /* Store sv_dat in a new data slot, taking an additional reference to it */
10918     if (sv_dat) {
10919         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10920         RExC_rxi->data->data[num]=(void*)sv_dat;
10921         SvREFCNT_inc_simple_void_NN(sv_dat);
10922     }
10923     RExC_sawback = 1;
10924     ret = reganode(pRExC_state,
10925                    ((! FOLD)
10926                      ? REFN
10927                      : (ASCII_FOLD_RESTRICTED)
10928                        ? REFFAN
10929                        : (AT_LEAST_UNI_SEMANTICS)
10930                          ? REFFUN
10931                          : (LOC)
10932                            ? REFFLN
10933                            : REFFN),
10934                     num);
10935     *flagp |= HASWIDTH;
10936     /* NOTE(review): presumably records the node's position/length in the pattern -- confirm */
10937     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
10938     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
10939     /* NOTE(review): nextchar() presumably steps past the terminating 'ch' -- confirm */
10940     nextchar(pRExC_state);
10941     return ret;
10942 }
10943
10944 /* On success, returns the offset at which any next node should be placed into
10945  * the regex engine program being compiled.
10946  *
10947  * Returns 0 otherwise, with *flagp set to indicate why:
10948  *  TRYAGAIN        at the end of (?) that only sets flags.
10949  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
10950  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
10951  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
10952  *  happen.  */
10953 STATIC regnode_offset
10954 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
10955     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10956      * 2 is like 1, but indicates that nextchar() has been called to advance
10957      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10958      * this flag alerts us to the need to check for that */
10959 {
10960     regnode_offset ret = 0;    /* Will be the head of the group. */
10961     regnode_offset br;
10962     regnode_offset lastbr;
10963     regnode_offset ender = 0;
10964     I32 parno = 0;
10965     I32 flags;
10966     U32 oregflags = RExC_flags;
10967     bool have_branch = 0;
10968     bool is_open = 0;
10969     I32 freeze_paren = 0;
10970     I32 after_freeze = 0;
10971     I32 num; /* numeric backreferences */
10972     SV * max_open;  /* Max number of unclosed parens */
10973
10974     char * parse_start = RExC_parse; /* MJD */
10975     char * const oregcomp_parse = RExC_parse;
10976
10977     GET_RE_DEBUG_FLAGS_DECL;
10978
10979     PERL_ARGS_ASSERT_REG;
10980     DEBUG_PARSE("reg ");
10981
10982     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
10983     assert(max_open);
10984     if (!SvIOK(max_open)) {
10985         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
10986     }
10987     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
10988                                               open paren */
10989         vFAIL("Too many nested open parens");
10990     }
10991
10992     *flagp = 0;                         /* Tentatively. */
10993
10994     if (RExC_in_lookbehind) {
10995         RExC_in_lookbehind++;
10996     }
10997     if (RExC_in_lookahead) {
10998         RExC_in_lookahead++;
10999     }
11000
11001     /* Having this true makes it feasible to have a lot fewer tests for the
11002      * parse pointer being in scope.  For example, we can write
11003      *      while(isFOO(*RExC_parse)) RExC_parse++;
11004      * instead of
11005      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11006      */
11007     assert(*RExC_end == '\0');
11008
11009     /* Make an OPEN node, if parenthesized. */
11010     if (paren) {
11011
11012         /* Under /x, space and comments can be gobbled up between the '(' and
11013          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11014          * intervening space, as the sequence is a token, and a token should be
11015          * indivisible */
11016         bool has_intervening_patws = (paren == 2)
11017                                   && *(RExC_parse - 1) != '(';
11018
11019         if (RExC_parse >= RExC_end) {
11020             vFAIL("Unmatched (");
11021         }
11022
11023         if (paren == 'r') {     /* Atomic script run */
11024             paren = '>';
11025             goto parse_rest;
11026         }
11027         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11028             char *start_verb = RExC_parse + 1;
11029             STRLEN verb_len;
11030             char *start_arg = NULL;
11031             unsigned char op = 0;
11032             int arg_required = 0;
11033             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11034             bool has_upper = FALSE;
11035
11036             if (has_intervening_patws) {
11037                 RExC_parse++;   /* past the '*' */
11038
11039                 /* For strict backwards compatibility, don't change the message
11040                  * now that we also have lowercase operands */
11041                 if (isUPPER(*RExC_parse)) {
11042                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11043                 }
11044                 else {
11045                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11046                 }
11047             }
11048             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11049                 if ( *RExC_parse == ':' ) {
11050                     start_arg = RExC_parse + 1;
11051                     break;
11052                 }
11053                 else if (! UTF) {
11054                     if (isUPPER(*RExC_parse)) {
11055                         has_upper = TRUE;
11056                     }
11057                     RExC_parse++;
11058                 }
11059                 else {
11060                     RExC_parse += UTF8SKIP(RExC_parse);
11061                 }
11062             }
11063             verb_len = RExC_parse - start_verb;
11064             if ( start_arg ) {
11065                 if (RExC_parse >= RExC_end) {
11066                     goto unterminated_verb_pattern;
11067                 }
11068
11069                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11070                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11071                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11072                 }
11073                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11074                   unterminated_verb_pattern:
11075                     if (has_upper) {
11076                         vFAIL("Unterminated verb pattern argument");
11077                     }
11078                     else {
11079                         vFAIL("Unterminated '(*...' argument");
11080                     }
11081                 }
11082             } else {
11083                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11084                     if (has_upper) {
11085                         vFAIL("Unterminated verb pattern");
11086                     }
11087                     else {
11088                         vFAIL("Unterminated '(*...' construct");
11089                     }
11090                 }
11091             }
11092
11093             /* Here, we know that RExC_parse < RExC_end */
11094
11095             switch ( *start_verb ) {
11096             case 'A':  /* (*ACCEPT) */
11097                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11098                     op = ACCEPT;
11099                     internal_argval = RExC_nestroot;
11100                 }
11101                 break;
11102             case 'C':  /* (*COMMIT) */
11103                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11104                     op = COMMIT;
11105                 break;
11106             case 'F':  /* (*FAIL) */
11107                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11108                     op = OPFAIL;
11109                 }
11110                 break;
11111             case ':':  /* (*:NAME) */
11112             case 'M':  /* (*MARK:NAME) */
11113                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11114                     op = MARKPOINT;
11115                     arg_required = 1;
11116                 }
11117                 break;
11118             case 'P':  /* (*PRUNE) */
11119                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11120                     op = PRUNE;
11121                 break;
11122             case 'S':   /* (*SKIP) */
11123                 if ( memEQs(start_verb, verb_len,"SKIP") )
11124                     op = SKIP;
11125                 break;
11126             case 'T':  /* (*THEN) */
11127                 /* [19:06] <TimToady> :: is then */
11128                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11129                     op = CUTGROUP;
11130                     RExC_seen |= REG_CUTGROUP_SEEN;
11131                 }
11132                 break;
11133             case 'a':
11134                 if (   memEQs(start_verb, verb_len, "asr")
11135                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11136                 {
11137                     paren = 'r';        /* Mnemonic: recursed run */
11138                     goto script_run;
11139                 }
11140                 else if (memEQs(start_verb, verb_len, "atomic")) {
11141                     paren = 't';    /* AtOMIC */
11142                     goto alpha_assertions;
11143                 }
11144                 break;
11145             case 'p':
11146                 if (   memEQs(start_verb, verb_len, "plb")
11147                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11148                 {
11149                     paren = 'b';
11150                     goto lookbehind_alpha_assertions;
11151                 }
11152                 else if (   memEQs(start_verb, verb_len, "pla")
11153                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11154                 {
11155                     paren = 'a';
11156                     goto alpha_assertions;
11157                 }
11158                 break;
11159             case 'n':
11160                 if (   memEQs(start_verb, verb_len, "nlb")
11161                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11162                 {
11163                     paren = 'B';
11164                     goto lookbehind_alpha_assertions;
11165                 }
11166                 else if (   memEQs(start_verb, verb_len, "nla")
11167                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11168                 {
11169                     paren = 'A';
11170                     goto alpha_assertions;
11171                 }
11172                 break;
11173             case 's':
11174                 if (   memEQs(start_verb, verb_len, "sr")
11175                     || memEQs(start_verb, verb_len, "script_run"))
11176                 {
11177                     regnode_offset atomic;
11178
11179                     paren = 's';
11180
11181                    script_run:
11182
11183                     /* This indicates Unicode rules. */
11184                     REQUIRE_UNI_RULES(flagp, 0);
11185
11186                     if (! start_arg) {
11187                         goto no_colon;
11188                     }
11189
11190                     RExC_parse = start_arg;
11191
11192                     if (RExC_in_script_run) {
11193
11194                         /*  Nested script runs are treated as no-ops, because
11195                          *  if the nested one fails, the outer one must as
11196                          *  well.  It could fail sooner, and avoid (??{} with
11197                          *  side effects, but that is explicitly documented as
11198                          *  undefined behavior. */
11199
11200                         ret = 0;
11201
11202                         if (paren == 's') {
11203                             paren = ':';
11204                             goto parse_rest;
11205                         }
11206
11207                         /* But, the atomic part of a nested atomic script run
11208                          * isn't a no-op, but can be treated just like a '(?>'
11209                          * */
11210                         paren = '>';
11211                         goto parse_rest;
11212                     }
11213
11214                     if (paren == 's') {
11215                         /* Here, we're starting a new regular script run */
11216                         ret = reg_node(pRExC_state, SROPEN);
11217                         RExC_in_script_run = 1;
11218                         is_open = 1;
11219                         goto parse_rest;
11220                     }
11221
11222                     /* Here, we are starting an atomic script run.  This is
11223                      * handled by recursing to deal with the atomic portion
11224                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11225
11226                     ret = reg_node(pRExC_state, SROPEN);
11227
11228                     RExC_in_script_run = 1;
11229
11230                     atomic = reg(pRExC_state, 'r', &flags, depth);
11231                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11232                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11233                         return 0;
11234                     }
11235
11236                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11237                         REQUIRE_BRANCHJ(flagp, 0);
11238                     }
11239
11240                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11241                                                                 SRCLOSE)))
11242                     {
11243                         REQUIRE_BRANCHJ(flagp, 0);
11244                     }
11245
11246                     RExC_in_script_run = 0;
11247                     return ret;
11248                 }
11249
11250                 break;
11251
11252             lookbehind_alpha_assertions:
11253                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11254                 RExC_in_lookbehind++;
11255                 /*FALLTHROUGH*/
11256
11257             alpha_assertions:
11258
11259                 RExC_seen_zerolen++;
11260
11261                 if (! start_arg) {
11262                     goto no_colon;
11263                 }
11264
11265                 /* An empty negative lookahead assertion simply is failure */
11266                 if (paren == 'A' && RExC_parse == start_arg) {
11267                     ret=reganode(pRExC_state, OPFAIL, 0);
11268                     nextchar(pRExC_state);
11269                     return ret;
11270                 }
11271
11272                 RExC_parse = start_arg;
11273                 goto parse_rest;
11274
11275               no_colon:
11276                 vFAIL2utf8f(
11277                 "'(*%" UTF8f "' requires a terminating ':'",
11278                 UTF8fARG(UTF, verb_len, start_verb));
11279                 NOT_REACHED; /*NOTREACHED*/
11280
11281             } /* End of switch */
11282             if ( ! op ) {
11283                 RExC_parse += UTF
11284                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11285                               : 1;
11286                 if (has_upper || verb_len == 0) {
11287                     vFAIL2utf8f(
11288                     "Unknown verb pattern '%" UTF8f "'",
11289                     UTF8fARG(UTF, verb_len, start_verb));
11290                 }
11291                 else {
11292                     vFAIL2utf8f(
11293                     "Unknown '(*...)' construct '%" UTF8f "'",
11294                     UTF8fARG(UTF, verb_len, start_verb));
11295                 }
11296             }
11297             if ( RExC_parse == start_arg ) {
11298                 start_arg = NULL;
11299             }
11300             if ( arg_required && !start_arg ) {
11301                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11302                     verb_len, start_verb);
11303             }
11304             if (internal_argval == -1) {
11305                 ret = reganode(pRExC_state, op, 0);
11306             } else {
11307                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11308             }
11309             RExC_seen |= REG_VERBARG_SEEN;
11310             if (start_arg) {
11311                 SV *sv = newSVpvn( start_arg,
11312                                     RExC_parse - start_arg);
11313                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11314                                         STR_WITH_LEN("S"));
11315                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11316                 FLAGS(REGNODE_p(ret)) = 1;
11317             } else {
11318                 FLAGS(REGNODE_p(ret)) = 0;
11319             }
11320             if ( internal_argval != -1 )
11321                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11322             nextchar(pRExC_state);
11323             return ret;
11324         }
11325         else if (*RExC_parse == '?') { /* (?...) */
11326             bool is_logical = 0;
11327             const char * const seqstart = RExC_parse;
11328             const char * endptr;
11329             if (has_intervening_patws) {
11330                 RExC_parse++;
11331                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11332             }
11333
11334             RExC_parse++;           /* past the '?' */
11335             paren = *RExC_parse;    /* might be a trailing NUL, if not
11336                                        well-formed */
11337             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11338             if (RExC_parse > RExC_end) {
11339                 paren = '\0';
11340             }
11341             ret = 0;                    /* For look-ahead/behind. */
11342             switch (paren) {
11343
11344             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11345                 paren = *RExC_parse;
11346                 if ( paren == '<') {    /* (?P<...>) named capture */
11347                     RExC_parse++;
11348                     if (RExC_parse >= RExC_end) {
11349                         vFAIL("Sequence (?P<... not terminated");
11350                     }
11351                     goto named_capture;
11352                 }
11353                 else if (paren == '>') {   /* (?P>name) named recursion */
11354                     RExC_parse++;
11355                     if (RExC_parse >= RExC_end) {
11356                         vFAIL("Sequence (?P>... not terminated");
11357                     }
11358                     goto named_recursion;
11359                 }
11360                 else if (paren == '=') {   /* (?P=...)  named backref */
11361                     RExC_parse++;
11362                     return handle_named_backref(pRExC_state, flagp,
11363                                                 parse_start, ')');
11364                 }
11365                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11366                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11367                 vFAIL3("Sequence (%.*s...) not recognized",
11368                                 RExC_parse-seqstart, seqstart);
11369                 NOT_REACHED; /*NOTREACHED*/
11370             case '<':           /* (?<...) */
11371                 if (*RExC_parse == '!')
11372                     paren = ',';
11373                 else if (*RExC_parse != '=')
11374               named_capture:
11375                 {               /* (?<...>) */
11376                     char *name_start;
11377                     SV *svname;
11378                     paren= '>';
11379                 /* FALLTHROUGH */
11380             case '\'':          /* (?'...') */
11381                     name_start = RExC_parse;
11382                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11383                     if (   RExC_parse == name_start
11384                         || RExC_parse >= RExC_end
11385                         || *RExC_parse != paren)
11386                     {
11387                         vFAIL2("Sequence (?%c... not terminated",
11388                             paren=='>' ? '<' : paren);
11389                     }
11390                     {
11391                         HE *he_str;
11392                         SV *sv_dat = NULL;
11393                         if (!svname) /* shouldn't happen */
11394                             Perl_croak(aTHX_
11395                                 "panic: reg_scan_name returned NULL");
11396                         if (!RExC_paren_names) {
11397                             RExC_paren_names= newHV();
11398                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11399 #ifdef DEBUGGING
11400                             RExC_paren_name_list= newAV();
11401                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11402 #endif
11403                         }
11404                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11405                         if ( he_str )
11406                             sv_dat = HeVAL(he_str);
11407                         if ( ! sv_dat ) {
11408                             /* croak baby croak */
11409                             Perl_croak(aTHX_
11410                                 "panic: paren_name hash element allocation failed");
11411                         } else if ( SvPOK(sv_dat) ) {
11412                             /* (?|...) can mean we have dupes so scan to check
11413                                it's already been stored. Maybe a flag indicating
11414                                we are inside such a construct would be useful,
11415                                but the arrays are likely to be quite small, so
11416                                for now we punt -- dmq */
11417                             IV count = SvIV(sv_dat);
11418                             I32 *pv = (I32*)SvPVX(sv_dat);
11419                             IV i;
11420                             for ( i = 0 ; i < count ; i++ ) {
11421                                 if ( pv[i] == RExC_npar ) {
11422                                     count = 0;
11423                                     break;
11424                                 }
11425                             }
11426                             if ( count ) {
11427                                 pv = (I32*)SvGROW(sv_dat,
11428                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11429                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11430                                 pv[count] = RExC_npar;
11431                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11432                             }
11433                         } else {
11434                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11435                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11436                                                                 sizeof(I32));
11437                             SvIOK_on(sv_dat);
11438                             SvIV_set(sv_dat, 1);
11439                         }
11440 #ifdef DEBUGGING
11441                         /* Yes this does cause a memory leak in debugging Perls
11442                          * */
11443                         if (!av_store(RExC_paren_name_list,
11444                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11445                             SvREFCNT_dec_NN(svname);
11446 #endif
11447
11448                         /*sv_dump(sv_dat);*/
11449                     }
11450                     nextchar(pRExC_state);
11451                     paren = 1;
11452                     goto capturing_parens;
11453                 }
11454
11455                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11456                 RExC_in_lookbehind++;
11457                 RExC_parse++;
11458                 if (RExC_parse >= RExC_end) {
11459                     vFAIL("Sequence (?... not terminated");
11460                 }
11461                 RExC_seen_zerolen++;
11462                 break;
11463             case '=':           /* (?=...) */
11464                 RExC_seen_zerolen++;
11465                 RExC_in_lookahead++;
11466                 break;
11467             case '!':           /* (?!...) */
11468                 RExC_seen_zerolen++;
11469                 /* check if we're really just a "FAIL" assertion */
11470                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11471                                         FALSE /* Don't force to /x */ );
11472                 if (*RExC_parse == ')') {
11473                     ret=reganode(pRExC_state, OPFAIL, 0);
11474                     nextchar(pRExC_state);
11475                     return ret;
11476                 }
11477                 break;
11478             case '|':           /* (?|...) */
11479                 /* branch reset, behave like a (?:...) except that
11480                    buffers in alternations share the same numbers */
11481                 paren = ':';
11482                 after_freeze = freeze_paren = RExC_npar;
11483
11484                 /* XXX This construct currently requires an extra pass.
11485                  * Investigation would be required to see if that could be
11486                  * changed */
11487                 REQUIRE_PARENS_PASS;
11488                 break;
11489             case ':':           /* (?:...) */
11490             case '>':           /* (?>...) */
11491                 break;
11492             case '$':           /* (?$...) */
11493             case '@':           /* (?@...) */
11494                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11495                 break;
11496             case '0' :           /* (?0) */
11497             case 'R' :           /* (?R) */
11498                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11499                     FAIL("Sequence (?R) not terminated");
11500                 num = 0;
11501                 RExC_seen |= REG_RECURSE_SEEN;
11502
11503                 /* XXX These constructs currently require an extra pass.
11504                  * It probably could be changed */
11505                 REQUIRE_PARENS_PASS;
11506
11507                 *flagp |= POSTPONED;
11508                 goto gen_recurse_regop;
11509                 /*notreached*/
11510             /* named and numeric recursion (these generate GOSUB) */
11511             case '&':            /* (?&NAME) */
11512                 parse_start = RExC_parse - 1;
11513               named_recursion:
11514                 {
11515                     SV *sv_dat = reg_scan_name(pRExC_state,
11516                                                REG_RSN_RETURN_DATA);
11517                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11518                 }
11519                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11520                     vFAIL("Sequence (?&... not terminated");
11521                 goto gen_recurse_regop;
11522                 /* NOTREACHED */
11523             case '+':
11524                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11525                     RExC_parse++;
11526                     vFAIL("Illegal pattern");
11527                 }
11528                 goto parse_recursion;
11529                 /* NOTREACHED*/
11530             case '-': /* (?-1) */
11531                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11532                     RExC_parse--; /* rewind to let it be handled later */
11533                     goto parse_flags;
11534                 }
11535                 /* FALLTHROUGH */
11536             case '1': case '2': case '3': case '4': /* (?1) */
11537             case '5': case '6': case '7': case '8': case '9':
11538                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11539               parse_recursion:
11540                 {
11541                     bool is_neg = FALSE;
11542                     UV unum;
11543                     parse_start = RExC_parse - 1; /* MJD */
11544                     if (*RExC_parse == '-') {
11545                         RExC_parse++;
11546                         is_neg = TRUE;
11547                     }
11548                     endptr = RExC_end;
11549                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11550                         && unum <= I32_MAX
11551                     ) {
11552                         num = (I32)unum;
11553                         RExC_parse = (char*)endptr;
11554                     } else
11555                         num = I32_MAX;
11556                     if (is_neg) {
11557                         /* Some limit for num? */
11558                         num = -num;
11559                     }
11560                 }
11561                 if (*RExC_parse!=')')
11562                     vFAIL("Expecting close bracket");
11563
11564               gen_recurse_regop:
11565                 if ( paren == '-' ) {
11566                     /*
11567                     Diagram of capture buffer numbering.
11568                     Top line is the normal capture buffer numbers
11569                     Bottom line is the negative indexing as from
11570                     the X (the (?-2))
11571
11572                     +   1 2    3 4 5 X          6 7
11573                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11574                     -   5 4    3 2 1 X          x x
11575
11576                     */
11577                     num = RExC_npar + num;
11578                     if (num < 1)  {
11579
11580                         /* It might be a forward reference; we can't fail until
11581                          * we know, by completing the parse to get all the
11582                          * groups, and then reparsing */
11583                         if (ALL_PARENS_COUNTED)  {
11584                             RExC_parse++;
11585                             vFAIL("Reference to nonexistent group");
11586                         }
11587                         else {
11588                             REQUIRE_PARENS_PASS;
11589                         }
11590                     }
11591                 } else if ( paren == '+' ) {
11592                     num = RExC_npar + num - 1;
11593                 }
11594                 /* We keep track how many GOSUB items we have produced.
11595                    To start off the ARG2L() of the GOSUB holds its "id",
11596                    which is used later in conjunction with RExC_recurse
11597                    to calculate the offset we need to jump for the GOSUB,
11598                    which it will store in the final representation.
11599                    We have to defer the actual calculation until much later
11600                    as the regop may move.
11601                  */
11602
11603                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11604                 if (num >= RExC_npar) {
11605
11606                     /* It might be a forward reference; we can't fail until we
11607                      * know, by completing the parse to get all the groups, and
11608                      * then reparsing */
11609                     if (ALL_PARENS_COUNTED)  {
11610                         if (num >= RExC_total_parens) {
11611                             RExC_parse++;
11612                             vFAIL("Reference to nonexistent group");
11613                         }
11614                     }
11615                     else {
11616                         REQUIRE_PARENS_PASS;
11617                     }
11618                 }
11619                 RExC_recurse_count++;
11620                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11621                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11622                             22, "|    |", (int)(depth * 2 + 1), "",
11623                             (UV)ARG(REGNODE_p(ret)),
11624                             (IV)ARG2L(REGNODE_p(ret))));
11625                 RExC_seen |= REG_RECURSE_SEEN;
11626
11627                 Set_Node_Length(REGNODE_p(ret),
11628                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11629                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11630
11631                 *flagp |= POSTPONED;
11632                 assert(*RExC_parse == ')');
11633                 nextchar(pRExC_state);
11634                 return ret;
11635
11636             /* NOTREACHED */
11637
11638             case '?':           /* (??...) */
11639                 is_logical = 1;
11640                 if (*RExC_parse != '{') {
11641                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11642                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11643                     vFAIL2utf8f(
11644                         "Sequence (%" UTF8f "...) not recognized",
11645                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11646                     NOT_REACHED; /*NOTREACHED*/
11647                 }
11648                 *flagp |= POSTPONED;
11649                 paren = '{';
11650                 RExC_parse++;
11651                 /* FALLTHROUGH */
11652             case '{':           /* (?{...}) */
11653             {
11654                 U32 n = 0;
11655                 struct reg_code_block *cb;
11656                 OP * o;
11657
11658                 RExC_seen_zerolen++;
11659
11660                 if (   !pRExC_state->code_blocks
11661                     || pRExC_state->code_index
11662                                         >= pRExC_state->code_blocks->count
11663                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11664                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11665                             - RExC_start)
11666                 ) {
11667                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11668                         FAIL("panic: Sequence (?{...}): no code block found\n");
11669                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11670                 }
11671                 /* this is a pre-compiled code block (?{...}) */
11672                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11673                 RExC_parse = RExC_start + cb->end;
11674                 o = cb->block;
11675                 if (cb->src_regex) {
11676                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11677                     RExC_rxi->data->data[n] =
11678                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11679                     RExC_rxi->data->data[n+1] = (void*)o;
11680                 }
11681                 else {
11682                     n = add_data(pRExC_state,
11683                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11684                     RExC_rxi->data->data[n] = (void*)o;
11685                 }
11686                 pRExC_state->code_index++;
11687                 nextchar(pRExC_state);
11688
11689                 if (is_logical) {
11690                     regnode_offset eval;
11691                     ret = reg_node(pRExC_state, LOGICAL);
11692
11693                     eval = reg2Lanode(pRExC_state, EVAL,
11694                                        n,
11695
11696                                        /* for later propagation into (??{})
11697                                         * return value */
11698                                        RExC_flags & RXf_PMf_COMPILETIME
11699                                       );
11700                     FLAGS(REGNODE_p(ret)) = 2;
11701                     if (! REGTAIL(pRExC_state, ret, eval)) {
11702                         REQUIRE_BRANCHJ(flagp, 0);
11703                     }
11704                     /* deal with the length of this later - MJD */
11705                     return ret;
11706                 }
11707                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11708                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11709                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11710                 return ret;
11711             }
11712             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11713             {
11714                 int is_define= 0;
11715                 const int DEFINE_len = sizeof("DEFINE") - 1;
11716                 if (    RExC_parse < RExC_end - 1
11717                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11718                             && (   RExC_parse[1] == '='
11719                                 || RExC_parse[1] == '!'
11720                                 || RExC_parse[1] == '<'
11721                                 || RExC_parse[1] == '{'))
11722                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11723                             && (   memBEGINs(RExC_parse + 1,
11724                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11725                                          "pla:")
11726                                 || memBEGINs(RExC_parse + 1,
11727                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11728                                          "plb:")
11729                                 || memBEGINs(RExC_parse + 1,
11730                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11731                                          "nla:")
11732                                 || memBEGINs(RExC_parse + 1,
11733                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11734                                          "nlb:")
11735                                 || memBEGINs(RExC_parse + 1,
11736                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11737                                          "positive_lookahead:")
11738                                 || memBEGINs(RExC_parse + 1,
11739                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11740                                          "positive_lookbehind:")
11741                                 || memBEGINs(RExC_parse + 1,
11742                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11743                                          "negative_lookahead:")
11744                                 || memBEGINs(RExC_parse + 1,
11745                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11746                                          "negative_lookbehind:"))))
11747                 ) { /* Lookahead or eval. */
11748                     I32 flag;
11749                     regnode_offset tail;
11750
11751                     ret = reg_node(pRExC_state, LOGICAL);
11752                     FLAGS(REGNODE_p(ret)) = 1;
11753
11754                     tail = reg(pRExC_state, 1, &flag, depth+1);
11755                     RETURN_FAIL_ON_RESTART(flag, flagp);
11756                     if (! REGTAIL(pRExC_state, ret, tail)) {
11757                         REQUIRE_BRANCHJ(flagp, 0);
11758                     }
11759                     goto insert_if;
11760                 }
11761                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11762                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11763                 {
11764                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11765                     char *name_start= RExC_parse++;
11766                     U32 num = 0;
11767                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11768                     if (   RExC_parse == name_start
11769                         || RExC_parse >= RExC_end
11770                         || *RExC_parse != ch)
11771                     {
11772                         vFAIL2("Sequence (?(%c... not terminated",
11773                             (ch == '>' ? '<' : ch));
11774                     }
11775                     RExC_parse++;
11776                     if (sv_dat) {
11777                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11778                         RExC_rxi->data->data[num]=(void*)sv_dat;
11779                         SvREFCNT_inc_simple_void_NN(sv_dat);
11780                     }
11781                     ret = reganode(pRExC_state, GROUPPN, num);
11782                     goto insert_if_check_paren;
11783                 }
11784                 else if (memBEGINs(RExC_parse,
11785                                    (STRLEN) (RExC_end - RExC_parse),
11786                                    "DEFINE"))
11787                 {
11788                     ret = reganode(pRExC_state, DEFINEP, 0);
11789                     RExC_parse += DEFINE_len;
11790                     is_define = 1;
11791                     goto insert_if_check_paren;
11792                 }
11793                 else if (RExC_parse[0] == 'R') {
11794                     RExC_parse++;
11795                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11796                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11797                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11798                      */
11799                     parno = 0;
11800                     if (RExC_parse[0] == '0') {
11801                         parno = 1;
11802                         RExC_parse++;
11803                     }
11804                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11805                         UV uv;
11806                         endptr = RExC_end;
11807                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11808                             && uv <= I32_MAX
11809                         ) {
11810                             parno = (I32)uv + 1;
11811                             RExC_parse = (char*)endptr;
11812                         }
11813                         /* else "Switch condition not recognized" below */
11814                     } else if (RExC_parse[0] == '&') {
11815                         SV *sv_dat;
11816                         RExC_parse++;
11817                         sv_dat = reg_scan_name(pRExC_state,
11818                                                REG_RSN_RETURN_DATA);
11819                         if (sv_dat)
11820                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11821                     }
11822                     ret = reganode(pRExC_state, INSUBP, parno);
11823                     goto insert_if_check_paren;
11824                 }
11825                 else if (inRANGE(RExC_parse[0], '1', '9')) {
11826                     /* (?(1)...) */
11827                     char c;
11828                     UV uv;
11829                     endptr = RExC_end;
11830                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11831                         && uv <= I32_MAX
11832                     ) {
11833                         parno = (I32)uv;
11834                         RExC_parse = (char*)endptr;
11835                     }
11836                     else {
11837                         vFAIL("panic: grok_atoUV returned FALSE");
11838                     }
11839                     ret = reganode(pRExC_state, GROUPP, parno);
11840
11841                  insert_if_check_paren:
11842                     if (UCHARAT(RExC_parse) != ')') {
11843                         RExC_parse += UTF
11844                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11845                                       : 1;
11846                         vFAIL("Switch condition not recognized");
11847                     }
11848                     nextchar(pRExC_state);
11849                   insert_if:
11850                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
11851                                                              IFTHEN, 0)))
11852                     {
11853                         REQUIRE_BRANCHJ(flagp, 0);
11854                     }
11855                     br = regbranch(pRExC_state, &flags, 1, depth+1);
11856                     if (br == 0) {
11857                         RETURN_FAIL_ON_RESTART(flags,flagp);
11858                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11859                               (UV) flags);
11860                     } else
11861                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
11862                                                              LONGJMP, 0)))
11863                     {
11864                         REQUIRE_BRANCHJ(flagp, 0);
11865                     }
11866                     c = UCHARAT(RExC_parse);
11867                     nextchar(pRExC_state);
11868                     if (flags&HASWIDTH)
11869                         *flagp |= HASWIDTH;
11870                     if (c == '|') {
11871                         if (is_define)
11872                             vFAIL("(?(DEFINE)....) does not allow branches");
11873
11874                         /* Fake one for optimizer.  */
11875                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11876
11877                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11878                             RETURN_FAIL_ON_RESTART(flags, flagp);
11879                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11880                                   (UV) flags);
11881                         }
11882                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
11883                             REQUIRE_BRANCHJ(flagp, 0);
11884                         }
11885                         if (flags&HASWIDTH)
11886                             *flagp |= HASWIDTH;
11887                         c = UCHARAT(RExC_parse);
11888                         nextchar(pRExC_state);
11889                     }
11890                     else
11891                         lastbr = 0;
11892                     if (c != ')') {
11893                         if (RExC_parse >= RExC_end)
11894                             vFAIL("Switch (?(condition)... not terminated");
11895                         else
11896                             vFAIL("Switch (?(condition)... contains too many branches");
11897                     }
11898                     ender = reg_node(pRExC_state, TAIL);
11899                     if (! REGTAIL(pRExC_state, br, ender)) {
11900                         REQUIRE_BRANCHJ(flagp, 0);
11901                     }
11902                     if (lastbr) {
11903                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
11904                             REQUIRE_BRANCHJ(flagp, 0);
11905                         }
11906                         if (! REGTAIL(pRExC_state,
11907                                       REGNODE_OFFSET(
11908                                                  NEXTOPER(
11909                                                  NEXTOPER(REGNODE_p(lastbr)))),
11910                                       ender))
11911                         {
11912                             REQUIRE_BRANCHJ(flagp, 0);
11913                         }
11914                     }
11915                     else
11916                         if (! REGTAIL(pRExC_state, ret, ender)) {
11917                             REQUIRE_BRANCHJ(flagp, 0);
11918                         }
11919 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
11920                     RExC_size++; /* XXX WHY do we need this?!!
11921                                     For large programs it seems to be required
11922                                     but I can't figure out why. -- dmq*/
11923 #endif
11924                     return ret;
11925                 }
11926                 RExC_parse += UTF
11927                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11928                               : 1;
11929                 vFAIL("Unknown switch condition (?(...))");
11930             }
11931             case '[':           /* (?[ ... ]) */
11932                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11933                                          oregcomp_parse);
11934             case 0: /* A NUL */
11935                 RExC_parse--; /* for vFAIL to print correctly */
11936                 vFAIL("Sequence (? incomplete");
11937                 break;
11938
11939             case ')':
11940                 if (RExC_strict) {  /* [perl #132851] */
11941                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
11942                 }
11943                 /* FALLTHROUGH */
11944             default: /* e.g., (?i) */
11945                 RExC_parse = (char *) seqstart + 1;
11946               parse_flags:
11947                 parse_lparen_question_flags(pRExC_state);
11948                 if (UCHARAT(RExC_parse) != ':') {
11949                     if (RExC_parse < RExC_end)
11950                         nextchar(pRExC_state);
11951                     *flagp = TRYAGAIN;
11952                     return 0;
11953                 }
11954                 paren = ':';
11955                 nextchar(pRExC_state);
11956                 ret = 0;
11957                 goto parse_rest;
11958             } /* end switch */
11959         }
11960         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11961           capturing_parens:
11962             parno = RExC_npar;
11963             RExC_npar++;
11964             if (! ALL_PARENS_COUNTED) {
11965                 /* If we are in our first pass through (and maybe only pass),
11966                  * we  need to allocate memory for the capturing parentheses
11967                  * data structures.
11968                  */
11969
11970                 if (!RExC_parens_buf_size) {
11971                     /* first guess at number of parens we might encounter */
11972                     RExC_parens_buf_size = 10;
11973
11974                     /* setup RExC_open_parens, which holds the address of each
11975                      * OPEN tag, and to make things simpler for the 0 index the
11976                      * start of the program - this is used later for offsets */
11977                     Newxz(RExC_open_parens, RExC_parens_buf_size,
11978                             regnode_offset);
11979                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
11980
11981                     /* setup RExC_close_parens, which holds the address of each
11982                      * CLOSE tag, and to make things simpler for the 0 index
11983                      * the end of the program - this is used later for offsets
11984                      * */
11985                     Newxz(RExC_close_parens, RExC_parens_buf_size,
11986                             regnode_offset);
11987                     /* we dont know where end op starts yet, so we dont need to
11988                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
11989                      * above */
11990                 }
11991                 else if (RExC_npar > RExC_parens_buf_size) {
11992                     I32 old_size = RExC_parens_buf_size;
11993
11994                     RExC_parens_buf_size *= 2;
11995
11996                     Renew(RExC_open_parens, RExC_parens_buf_size,
11997                             regnode_offset);
11998                     Zero(RExC_open_parens + old_size,
11999                             RExC_parens_buf_size - old_size, regnode_offset);
12000
12001                     Renew(RExC_close_parens, RExC_parens_buf_size,
12002                             regnode_offset);
12003                     Zero(RExC_close_parens + old_size,
12004                             RExC_parens_buf_size - old_size, regnode_offset);
12005                 }
12006             }
12007
12008             ret = reganode(pRExC_state, OPEN, parno);
12009             if (!RExC_nestroot)
12010                 RExC_nestroot = parno;
12011             if (RExC_open_parens && !RExC_open_parens[parno])
12012             {
12013                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12014                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12015                     22, "|    |", (int)(depth * 2 + 1), "",
12016                     (IV)parno, ret));
12017                 RExC_open_parens[parno]= ret;
12018             }
12019
12020             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12021             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12022             is_open = 1;
12023         } else {
12024             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12025             paren = ':';
12026             ret = 0;
12027         }
12028     }
12029     else                        /* ! paren */
12030         ret = 0;
12031
12032    parse_rest:
12033     /* Pick up the branches, linking them together. */
12034     parse_start = RExC_parse;   /* MJD */
12035     br = regbranch(pRExC_state, &flags, 1, depth+1);
12036
12037     /*     branch_len = (paren != 0); */
12038
12039     if (br == 0) {
12040         RETURN_FAIL_ON_RESTART(flags, flagp);
12041         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12042     }
12043     if (*RExC_parse == '|') {
12044         if (RExC_use_BRANCHJ) {
12045             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12046         }
12047         else {                  /* MJD */
12048             reginsert(pRExC_state, BRANCH, br, depth+1);
12049             Set_Node_Length(REGNODE_p(br), paren != 0);
12050             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12051         }
12052         have_branch = 1;
12053     }
12054     else if (paren == ':') {
12055         *flagp |= flags&SIMPLE;
12056     }
12057     if (is_open) {                              /* Starts with OPEN. */
12058         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12059             REQUIRE_BRANCHJ(flagp, 0);
12060         }
12061     }
12062     else if (paren != '?')              /* Not Conditional */
12063         ret = br;
12064     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12065     lastbr = br;
12066     while (*RExC_parse == '|') {
12067         if (RExC_use_BRANCHJ) {
12068             bool shut_gcc_up;
12069
12070             ender = reganode(pRExC_state, LONGJMP, 0);
12071
12072             /* Append to the previous. */
12073             shut_gcc_up = REGTAIL(pRExC_state,
12074                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12075                          ender);
12076             PERL_UNUSED_VAR(shut_gcc_up);
12077         }
12078         nextchar(pRExC_state);
12079         if (freeze_paren) {
12080             if (RExC_npar > after_freeze)
12081                 after_freeze = RExC_npar;
12082             RExC_npar = freeze_paren;
12083         }
12084         br = regbranch(pRExC_state, &flags, 0, depth+1);
12085
12086         if (br == 0) {
12087             RETURN_FAIL_ON_RESTART(flags, flagp);
12088             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12089         }
12090         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12091             REQUIRE_BRANCHJ(flagp, 0);
12092         }
12093         lastbr = br;
12094         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12095     }
12096
12097     if (have_branch || paren != ':') {
12098         regnode * br;
12099
12100         /* Make a closing node, and hook it on the end. */
12101         switch (paren) {
12102         case ':':
12103             ender = reg_node(pRExC_state, TAIL);
12104             break;
12105         case 1: case 2:
12106             ender = reganode(pRExC_state, CLOSE, parno);
12107             if ( RExC_close_parens ) {
12108                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12109                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12110                         22, "|    |", (int)(depth * 2 + 1), "",
12111                         (IV)parno, ender));
12112                 RExC_close_parens[parno]= ender;
12113                 if (RExC_nestroot == parno)
12114                     RExC_nestroot = 0;
12115             }
12116             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12117             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12118             break;
12119         case 's':
12120             ender = reg_node(pRExC_state, SRCLOSE);
12121             RExC_in_script_run = 0;
12122             break;
12123         case '<':
12124         case 'a':
12125         case 'A':
12126         case 'b':
12127         case 'B':
12128         case ',':
12129         case '=':
12130         case '!':
12131             *flagp &= ~HASWIDTH;
12132             /* FALLTHROUGH */
12133         case 't':   /* aTomic */
12134         case '>':
12135             ender = reg_node(pRExC_state, SUCCEED);
12136             break;
12137         case 0:
12138             ender = reg_node(pRExC_state, END);
12139             assert(!RExC_end_op); /* there can only be one! */
12140             RExC_end_op = REGNODE_p(ender);
12141             if (RExC_close_parens) {
12142                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12143                     "%*s%*s Setting close paren #0 (END) to %d\n",
12144                     22, "|    |", (int)(depth * 2 + 1), "",
12145                     ender));
12146
12147                 RExC_close_parens[0]= ender;
12148             }
12149             break;
12150         }
12151         DEBUG_PARSE_r(
12152             DEBUG_PARSE_MSG("lsbr");
12153             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12154             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12155             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12156                           SvPV_nolen_const(RExC_mysv1),
12157                           (IV)lastbr,
12158                           SvPV_nolen_const(RExC_mysv2),
12159                           (IV)ender,
12160                           (IV)(ender - lastbr)
12161             );
12162         );
12163         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12164             REQUIRE_BRANCHJ(flagp, 0);
12165         }
12166
12167         if (have_branch) {
12168             char is_nothing= 1;
12169             if (depth==1)
12170                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12171
12172             /* Hook the tails of the branches to the closing node. */
12173             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12174                 const U8 op = PL_regkind[OP(br)];
12175                 if (op == BRANCH) {
12176                     if (! REGTAIL_STUDY(pRExC_state,
12177                                         REGNODE_OFFSET(NEXTOPER(br)),
12178                                         ender))
12179                     {
12180                         REQUIRE_BRANCHJ(flagp, 0);
12181                     }
12182                     if ( OP(NEXTOPER(br)) != NOTHING
12183                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12184                         is_nothing= 0;
12185                 }
12186                 else if (op == BRANCHJ) {
12187                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12188                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12189                                         ender);
12190                     PERL_UNUSED_VAR(shut_gcc_up);
12191                     /* for now we always disable this optimisation * /
12192                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12193                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12194                     */
12195                         is_nothing= 0;
12196                 }
12197             }
12198             if (is_nothing) {
12199                 regnode * ret_as_regnode = REGNODE_p(ret);
12200                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12201                                ? regnext(ret_as_regnode)
12202                                : ret_as_regnode;
12203                 DEBUG_PARSE_r(
12204                     DEBUG_PARSE_MSG("NADA");
12205                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12206                                      NULL, pRExC_state);
12207                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12208                                      NULL, pRExC_state);
12209                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12210                                   SvPV_nolen_const(RExC_mysv1),
12211                                   (IV)REG_NODE_NUM(ret_as_regnode),
12212                                   SvPV_nolen_const(RExC_mysv2),
12213                                   (IV)ender,
12214                                   (IV)(ender - ret)
12215                     );
12216                 );
12217                 OP(br)= NOTHING;
12218                 if (OP(REGNODE_p(ender)) == TAIL) {
12219                     NEXT_OFF(br)= 0;
12220                     RExC_emit= REGNODE_OFFSET(br) + 1;
12221                 } else {
12222                     regnode *opt;
12223                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12224                         OP(opt)= OPTIMIZED;
12225                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12226                 }
12227             }
12228         }
12229     }
12230
12231     {
12232         const char *p;
12233          /* Even/odd or x=don't care: 010101x10x */
12234         static const char parens[] = "=!aA<,>Bbt";
12235          /* flag below is set to 0 up through 'A'; 1 for larger */
12236
12237         if (paren && (p = strchr(parens, paren))) {
12238             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12239             int flag = (p - parens) > 3;
12240
12241             if (paren == '>' || paren == 't') {
12242                 node = SUSPEND, flag = 0;
12243             }
12244
12245             reginsert(pRExC_state, node, ret, depth+1);
12246             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12247             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12248             FLAGS(REGNODE_p(ret)) = flag;
12249             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12250             {
12251                 REQUIRE_BRANCHJ(flagp, 0);
12252             }
12253         }
12254     }
12255
12256     /* Check for proper termination. */
12257     if (paren) {
12258         /* restore original flags, but keep (?p) and, if we've encountered
12259          * something in the parse that changes /d rules into /u, keep the /u */
12260         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12261         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12262             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12263         }
12264         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12265             RExC_parse = oregcomp_parse;
12266             vFAIL("Unmatched (");
12267         }
12268         nextchar(pRExC_state);
12269     }
12270     else if (!paren && RExC_parse < RExC_end) {
12271         if (*RExC_parse == ')') {
12272             RExC_parse++;
12273             vFAIL("Unmatched )");
12274         }
12275         else
12276             FAIL("Junk on end of regexp");      /* "Can't happen". */
12277         NOT_REACHED; /* NOTREACHED */
12278     }
12279
12280     if (RExC_in_lookbehind) {
12281         RExC_in_lookbehind--;
12282     }
12283     if (RExC_in_lookahead) {
12284         RExC_in_lookahead--;
12285     }
12286     if (after_freeze > RExC_npar)
12287         RExC_npar = after_freeze;
12288     return(ret);
12289 }
12290
12291 /*
12292  - regbranch - one alternative of an | operator
12293  *
12294  * Implements the concatenation operator.
12295  *
12296  * On success, returns the offset at which any next node should be placed into
12297  * the regex engine program being compiled.
12298  *
12299  * Returns 0 otherwise, setting *flagp to RESTART_PARSE if the parse needs
12300  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12301  * UTF-8.
12302  */
12303 STATIC regnode_offset
12304 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12305 {
          /* Parse one alternative: a run of pieces, each obtained from
           * regpiece(), up to (but not including) the next '|' or ')' or the
           * end of the pattern, linking each piece to the one before it.
           *
           * <first> is true for the first alternative of a group; no branch
           * node is emitted here in that case.  Otherwise a BRANCHJ (when
           * RExC_use_BRANCHJ is set) or a BRANCH node is emitted ahead of the
           * pieces.
           *
           * The value returned on success is the offset of the first node
           * generated for this alternative: the BRANCH/BRANCHJ node if one
           * was emitted, else the first piece, else the NOTHING node that is
           * generated when the alternative contains no pieces at all.
           *
           * *flagp starts as WORST and then accumulates HASWIDTH and
           * POSTPONED from every piece, and SPSTART from the first piece
           * only.  If exactly one piece was chained, SIMPLE is copied from
           * the flags left by the most recent regpiece() call. */
12306     regnode_offset ret;
12307     regnode_offset chain = 0;
12308     regnode_offset latest;
12309     I32 flags = 0, c = 0;
12310     GET_RE_DEBUG_FLAGS_DECL;
12311
12312     PERL_ARGS_ASSERT_REGBRANCH;
12313
12314     DEBUG_PARSE("brnc");
12315
12316     if (first)
12317         ret = 0;
12318     else {
12319         if (RExC_use_BRANCHJ)
12320             ret = reganode(pRExC_state, BRANCHJ, 0);
12321         else {
12322             ret = reg_node(pRExC_state, BRANCH);
12323             Set_Node_Length(REGNODE_p(ret), 1);
12324         }
12325     }
12326
12327     *flagp = WORST;                     /* Tentatively. */
12328
12329     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12330                             FALSE /* Don't force to /x */ );
12331     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
              /* Clear any TRYAGAIN left over from the previous iteration so
               * the test below reflects only this regpiece() call */
12332         flags &= ~TRYAGAIN;
12333         latest = regpiece(pRExC_state, &flags, depth+1);
12334         if (latest == 0) {
                  /* No node was generated.  With TRYAGAIN that is not an
                   * error: just go round again from the current RExC_parse */
12335             if (flags & TRYAGAIN)
12336                 continue;
                  /* NOTE(review): both macros are defined outside this chunk;
                   * presumably the first returns 0 after passing the restart
                   * flags up through *flagp, and the second croaks -- confirm */
12337             RETURN_FAIL_ON_RESTART(flags, flagp);
12338             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12339         }
12340         else if (ret == 0)
12341             ret = latest;
12342         *flagp |= flags&(HASWIDTH|POSTPONED);
12343         if (chain == 0)         /* First piece. */
12344             *flagp |= flags&SPSTART;
12345         else {
12346             /* FIXME adding one for every branch after the first is probably
12347              * excessive now we have TRIE support. (hv) */
12348             MARK_NAUGHTY(1);
12349             if (! REGTAIL(pRExC_state, chain, latest)) {
12350                 /* XXX We could just redo this branch, but figuring out what
12351                  * bookkeeping needs to be reset is a pain, and it's likely
12352                  * that other branches that goto END will also be too large */
12353                 REQUIRE_BRANCHJ(flagp, 0);
12354             }
12355         }
12356         chain = latest;
              /* c counts the pieces that actually produced a node */
12357         c++;
12358     }
12359     if (chain == 0) {   /* Loop ran zero times. */
              /* (or every pass through it ended in the TRYAGAIN 'continue').
               * Emit a NOTHING node so the alternative is never empty */
12360         chain = reg_node(pRExC_state, NOTHING);
12361         if (ret == 0)
12362             ret = chain;
12363     }
12364     if (c == 1) {
12365         *flagp |= flags&SIMPLE;
12366     }
12367
12368     return ret;
12369 }
12370
12371 /*
12372  - regpiece - something followed by possible quantifier * + ? {n,m}
12373  *
12374  * Note that the branching code sequences used for ? and the general cases
12375  * of * and + are somewhat optimized:  they use the same NOTHING node as
12376  * both the endmarker for their branch list and the body of the last branch.
12377  * It might seem that this node could be dispensed with entirely, but the
12378  * endmarker role is not redundant.
12379  *
12380  * On success, returns the offset at which any next node should be placed into
12381  * the regex engine program being compiled.
12382  *
12383  * Returns 0 otherwise, with *flagp set to indicate why:
12384  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12385  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12386  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12387  */
12388 STATIC regnode_offset
12389 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12390 {
          /* Parse one atom via regatom() and then any quantifier that follows
           * it: {n}, {n,}, {n,m}, {,m}, '*', '+' or '?'.  A quantifier may in
           * turn be followed by '?' (a MINMOD node is inserted) or by '+' (the
           * whole construct is wrapped in SUSPEND ... SUCCEED, TAIL).  Any
           * further quantifier after that is rejected with "Nested
           * quantifiers".
           *
           * The value returned on success is <ret>, the offset originally
           * returned by regatom(); the quantifier nodes are put in front of
           * the atom by reginsert() at that same offset. */
12391     regnode_offset ret;
12392     char op;
12393     char *next;
12394     I32 flags;
12395     const char * const origparse = RExC_parse;
12396     I32 min;
          /* '*' and '+' below rely on this initial value: they set only min */
12397     I32 max = REG_INFTY;
          /* NOTE(review): parse_start exists only under
           * RE_TRACK_PATTERN_OFFSETS, yet it is named unconditionally in the
           * Set_Node_* calls below; presumably those macros discard their
           * arguments when offsets are not tracked -- confirm */
12398 #ifdef RE_TRACK_PATTERN_OFFSETS
12399     char *parse_start;
12400 #endif
12401     const char *maxpos = NULL;
12402     UV uv;
12403
12404     /* Save the original in case we change the emitted regop to a FAIL. */
12405     const regnode_offset orig_emit = RExC_emit;
12406
12407     GET_RE_DEBUG_FLAGS_DECL;
12408
12409     PERL_ARGS_ASSERT_REGPIECE;
12410
12411     DEBUG_PARSE("piec");
12412
12413     ret = regatom(pRExC_state, &flags, depth+1);
12414     if (ret == 0) {
12415         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12416         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12417     }
12418
12419     op = *RExC_parse;
12420
          /* A '{' is treated as a quantifier only if regcurly() accepts it
           * and the scan below ends on a '}'.  Otherwise control drops
           * through to the ISMULT1() test further down. */
12421     if (op == '{' && regcurly(RExC_parse)) {
12422         maxpos = NULL;
12423 #ifdef RE_TRACK_PATTERN_OFFSETS
12424         parse_start = RExC_parse; /* MJD */
12425 #endif
12426         next = RExC_parse + 1;
              /* Step over digits and at most one ',', remembering in maxpos
               * where that ',' is.  A second ',' ends the scan. */
12427         while (isDIGIT(*next) || *next == ',') {
12428             if (*next == ',') {
12429                 if (maxpos)
12430                     break;
12431                 else
12432                     maxpos = next;
12433             }
12434             next++;
12435         }
12436         if (*next == '}') {             /* got one */
12437             const char* endptr;
                  /* No ',' was seen: aim maxpos at the '}' so that the
                   * *maxpos == ',' test below fails */
12438             if (!maxpos)
12439                 maxpos = next;
                  /* Step past the '{'.  No leading digit means min is 0 */
12440             RExC_parse++;
12441             if (isDIGIT(*RExC_parse)) {
12442                 endptr = RExC_end;
12443                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12444                     vFAIL("Invalid quantifier in {,}");
12445                 if (uv >= REG_INFTY)
12446                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12447                 min = (I32)uv;
12448             } else {
12449                 min = 0;
12450             }
                  /* With a ',' the maximum starts just after it.  Without
                   * one, maxpos is pointed back at RExC_parse, so the text
                   * examined for the maximum is the same as for the minimum */
12451             if (*maxpos == ',')
12452                 maxpos++;
12453             else
12454                 maxpos = RExC_parse;
12455             if (isDIGIT(*maxpos)) {
12456                 endptr = RExC_end;
12457                 if (!grok_atoUV(maxpos, &uv, &endptr))
12458                     vFAIL("Invalid quantifier in {,}");
12459                 if (uv >= REG_INFTY)
12460                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12461                 max = (I32)uv;
12462             } else {
12463                 max = REG_INFTY;                /* meaning "infinity" */
12464             }
                  /* <next> is on the '}'; nextchar() moves the parse past it */
12465             RExC_parse = next;
12466             nextchar(pRExC_state);
12467             if (max < min) {    /* If can't match, warn and optimize to fail
12468                                    unconditionally */
                      /* The OPFAIL goes in at orig_emit, the emit position
                       * saved before regatom() ran, and this path returns
                       * without looking for a trailing '?' or '+' */
12469                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12470                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12471                 NEXT_OFF(REGNODE_p(orig_emit)) =
12472                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12473                 return ret;
12474             }
12475             else if (min == max && *RExC_parse == '?')
12476             {
12477                 ckWARN2reg(RExC_parse + 1,
12478                            "Useless use of greediness modifier '%c'",
12479                            *RExC_parse);
12480             }
12481
                /* Reached by falling through for {n,m}, and by goto from the
                 * '*', '+' and '?' handling near the end of this function,
                 * which sets min (and for '?' also max) before jumping */
12482           do_curly:
                  /* Operand flagged SIMPLE: a single STAR, PLUS or CURLY node
                   * is inserted in front of it */
12483             if ((flags&SIMPLE)) {
12484                 if (min == 0 && max == REG_INFTY) {
12485                     reginsert(pRExC_state, STAR, ret, depth+1);
12486                     MARK_NAUGHTY(4);
12487                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
                          /* skips the FLAGS/ARG1/ARG2 setup done below */
12488                     goto nest_check;
12489                 }
12490                 if (min == 1 && max == REG_INFTY) {
12491                     reginsert(pRExC_state, PLUS, ret, depth+1);
12492                     MARK_NAUGHTY(3);
12493                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
                          /* likewise skips the FLAGS/ARG1/ARG2 setup */
12494                     goto nest_check;
12495                 }
12496                 MARK_NAUGHTY_EXP(2, 2);
12497                 reginsert(pRExC_state, CURLY, ret, depth+1);
12498                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12499                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12500             }
12501             else {
                      /* Operand not SIMPLE: a WHILEM is appended after it and
                       * a CURLYX inserted in front of it.  When BRANCHJ is in
                       * use, NOTHING and LONGJMP nodes are inserted between
                       * the CURLYX and the operand as well. */
12502                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12503
12504                 FLAGS(REGNODE_p(w)) = 0;
12505                 if (!  REGTAIL(pRExC_state, ret, w)) {
12506                     REQUIRE_BRANCHJ(flagp, 0);
12507                 }
12508                 if (RExC_use_BRANCHJ) {
12509                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12510                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12511                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12512                 }
12513                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12514                                 /* MJD hk */
12515                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12516                 Set_Node_Length(REGNODE_p(ret),
12517                                 op == '{' ? (RExC_parse - parse_start) : 1);
12518
12519                 if (RExC_use_BRANCHJ)
12520                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12521                                                        LONGJMP. */
12522                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12523                                                           NOTHING)))
12524                 {
12525                     REQUIRE_BRANCHJ(flagp, 0);
12526                 }
12527                 RExC_whilem_seen++;
12528                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12529             }
                  /* Only the CURLY and CURLYX cases get this far; <ret> is
                   * now the offset of that inserted node */
12530             FLAGS(REGNODE_p(ret)) = 0;
12531
                  /* NOTE(review): on the {n,m} path with min == 0, nothing in
                   * this function has assigned *flagp before the '|=' below,
                   * so it ORs into whatever the caller left there.  The '*',
                   * '+' and '?' paths assign *flagp before jumping to
                   * do_curly.  Confirm this is intended. */
12532             if (min > 0)
12533                 *flagp = WORST;
12534             if (max > 0)
12535                 *flagp |= HASWIDTH;
12536             ARG1_SET(REGNODE_p(ret), (U16)min);
12537             ARG2_SET(REGNODE_p(ret), (U16)max);
12538             if (max == REG_INFTY)
12539                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12540
12541             goto nest_check;
12542         }
12543     }
12544
          /* No quantifier here as far as ISMULT1() is concerned: hand back the
           * atom with its flags unchanged */
12545     if (!ISMULT1(op)) {
12546         *flagp = flags;
12547         return(ret);
12548     }
12549
12550 #if 0                           /* Now runtime fix should be reliable. */
12551
12552     /* if this is reinstated, don't forget to put this back into perldiag:
12553
12554             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12555
12556            (F) The part of the regexp subject to either the * or + quantifier
12557            could match an empty string. The {#} shows in the regular
12558            expression about where the problem was discovered.
12559
12560     */
12561
12562     if (!(flags&HASWIDTH) && op != '?')
12563       vFAIL("Regexp *+ operand could be empty");
12564 #endif
12565
12566 #ifdef RE_TRACK_PATTERN_OFFSETS
12567     parse_start = RExC_parse;
12568 #endif
12569     nextchar(pRExC_state);
12570
          /* Every quantifier character except '+' gets SPSTART as well */
12571     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12572
          /* Each arm jumps back into the '{' handling above.  For '*' and '+'
           * max is still its initial REG_INFTY.  If op is none of the three,
           * control falls through to nest_check. */
12573     if (op == '*') {
12574         min = 0;
12575         goto do_curly;
12576     }
12577     else if (op == '+') {
12578         min = 1;
12579         goto do_curly;
12580     }
12581     else if (op == '?') {
12582         min = 0; max = 1;
12583         goto do_curly;
12584     }
12585   nest_check:
          /* Warn when the operand's flags have neither HASWIDTH nor POSTPONED
           * and the maximum repeat count exceeds REG_INFTY/3.  The text shown
           * runs from where this piece started to the current parse position. */
12586     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12587         ckWARN2reg(RExC_parse,
12588                    "%" UTF8f " matches null string many times",
12589                    UTF8fARG(UTF, (RExC_parse >= origparse
12590                                  ? RExC_parse - origparse
12591                                  : 0),
12592                    origparse));
12593     }
12594
          /* A '?' after the quantifier: insert a MINMOD node at <ret> and link
           * it to the node immediately following it */
12595     if (*RExC_parse == '?') {
12596         nextchar(pRExC_state);
12597         reginsert(pRExC_state, MINMOD, ret, depth+1);
12598         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12599             REQUIRE_BRANCHJ(flagp, 0);
12600         }
12601     }
          /* A '+' after the quantifier: append a SUCCEED, insert a SUSPEND at
           * <ret>, then append a TAIL and link the SUSPEND to it */
12602     else if (*RExC_parse == '+') {
12603         regnode_offset ender;
12604         nextchar(pRExC_state);
12605         ender = reg_node(pRExC_state, SUCCEED);
12606         if (! REGTAIL(pRExC_state, ret, ender)) {
12607             REQUIRE_BRANCHJ(flagp, 0);
12608         }
12609         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12610         ender = reg_node(pRExC_state, TAIL);
12611         if (! REGTAIL(pRExC_state, ret, ender)) {
12612             REQUIRE_BRANCHJ(flagp, 0);
12613         }
12614     }
12615
          /* Anything ISMULT2() still recognises at this point is an error;
           * the parse pointer is advanced first, before vFAIL is invoked */
12616     if (ISMULT2(RExC_parse)) {
12617         RExC_parse++;
12618         vFAIL("Nested quantifiers");
12619     }
12620
12621     return(ret);
12622 }
12623
12624 STATIC bool
12625 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12626                 regnode_offset * node_p,
12627                 UV * code_point_p,
12628                 int * cp_count,
12629                 I32 * flagp,
12630                 const bool strict,
12631                 const U32 depth
12632     )
12633 {
12634  /* This routine teases apart the various meanings of \N and returns
12635   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12636   * in the current context.
12637   *
12638   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12639   *
12640   * If <code_point_p> is not NULL, the context is expecting the result to be a
12641   * single code point.  If this \N instance turns out to a single code point,
12642   * the function returns TRUE and sets *code_point_p to that code point.
12643   *
12644   * If <node_p> is not NULL, the context is expecting the result to be one of
12645   * the things representable by a regnode.  If this \N instance turns out to be
12646   * one such, the function generates the regnode, returns TRUE and sets *node_p
12647   * to point to the offset of that regnode into the regex engine program being
12648   * compiled.
12649   *
12650   * If this instance of \N isn't legal in any context, this function will
12651   * generate a fatal error and not return.
12652   *
12653   * On input, RExC_parse should point to the first char following the \N at the
12654   * time of the call.  On successful return, RExC_parse will have been updated
12655   * to point to just after the sequence identified by this routine.  Also
12656   * *flagp has been updated as needed.
12657   *
12658   * When there is some problem with the current context and this \N instance,
12659   * the function returns FALSE, without advancing RExC_parse, nor setting
12660   * *node_p, nor *code_point_p, nor *flagp.
12661   *
12662   * If <cp_count> is not NULL, the caller wants to know the length (in code
12663   * points) that this \N sequence matches.  This is set, and the input is
12664   * parsed for errors, even if the function returns FALSE, as detailed below.
12665   *
12666   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12667   *
12668   * Probably the most common case is for the \N to specify a single code point.
12669   * *cp_count will be set to 1, and *code_point_p will be set to that code
12670   * point.
12671   *
12672   * Another possibility is for the input to be an empty \N{}.  This is no
12673   * longer accepted, and will generate a fatal error.
12674   *
12675   * Another possibility is for a custom charnames handler to be in effect which
12676   * translates the input name to an empty string.  *cp_count will be set to 0.
12677   * *node_p will be set to a generated NOTHING node.
12678   *
12679   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12680   * set to 0. *node_p will be set to a generated REG_ANY node.
12681   *
12682   * The fifth possibility is that \N resolves to a sequence of more than one
12683   * code points.  *cp_count will be set to the number of code points in the
12684   * sequence. *node_p will be set to a generated node returned by this
12685   * function calling S_reg().
12686   *
12687   * The final possibility is that it is premature to be calling this function;
12688   * the parse needs to be restarted.  This can happen when this changes from
12689   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12690   * latter occurs only when the fifth possibility would otherwise be in
12691   * effect, and is because one of those code points requires the pattern to be
12692   * recompiled as UTF-8.  The function returns FALSE, and sets the
12693   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12694   * happens, the caller needs to desist from continuing parsing, and return
12695   * this information to its caller.  This is not set for when there is only one
12696   * code point, as this can be called as part of an ANYOF node, and they can
12697   * store above-Latin1 code points without the pattern having to be in UTF-8.
12698   *
12699   * For non-single-quoted regexes, the tokenizer has resolved character and
12700   * sequence names inside \N{...} into their Unicode values, normalizing the
12701   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12702   * hex-represented code points in the sequence.  This is done there because
12703   * the names can vary based on what charnames pragma is in scope at the time,
12704   * so we need a way to take a snapshot of what they resolve to at the time of
12705   * the original parse. [perl #56444].
12706   *
12707   * That parsing is skipped for single-quoted regexes, so here we may get
12708   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12709   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12710   * the native character set for non-ASCII platforms.  The other possibilities
12711   * are already native, so no translation is done. */
12712
12713     char * endbrace;    /* points to '}' following the name */
12714     char* p = RExC_parse; /* Temporary */
12715
12716     SV * substitute_parse = NULL;
12717     char *orig_end;
12718     char *save_start;
12719     I32 flags;
12720
12721     GET_RE_DEBUG_FLAGS_DECL;
12722
12723     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12724
12725     GET_RE_DEBUG_FLAGS;
12726
12727     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12728     assert(! (node_p && cp_count));               /* At most 1 should be set */
12729
12730     if (cp_count) {     /* Initialize return for the most common case */
12731         *cp_count = 1;
12732     }
12733
12734     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12735      * modifier.  The other meanings do not, so use a temporary until we find
12736      * out which we are being called with */
12737     skip_to_be_ignored_text(pRExC_state, &p,
12738                             FALSE /* Don't force to /x */ );
12739
12740     /* Disambiguate between \N meaning a named character versus \N meaning
12741      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12742      * quantifier, or if there is no '{' at all */
12743     if (*p != '{' || regcurly(p)) {
12744         RExC_parse = p;
12745         if (cp_count) {
12746             *cp_count = -1;
12747         }
12748
12749         if (! node_p) {
12750             return FALSE;
12751         }
12752
12753         *node_p = reg_node(pRExC_state, REG_ANY);
12754         *flagp |= HASWIDTH|SIMPLE;
12755         MARK_NAUGHTY(1);
12756         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12757         return TRUE;
12758     }
12759
12760     /* The test above made sure that the next real character is a '{', but
12761      * under the /x modifier, it could be separated by space (or a comment and
12762      * \n) and this is not allowed (for consistency with \x{...} and the
12763      * tokenizer handling of \N{NAME}). */
12764     if (*RExC_parse != '{') {
12765         vFAIL("Missing braces on \\N{}");
12766     }
12767
12768     RExC_parse++;       /* Skip past the '{' */
12769
12770     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12771     if (! endbrace) { /* no trailing brace */
12772         vFAIL2("Missing right brace on \\%c{}", 'N');
12773     }
12774
12775     /* Here, we have decided it should be a named character or sequence.  These
12776      * imply Unicode semantics */
12777     REQUIRE_UNI_RULES(flagp, FALSE);
12778
12779     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12780      * nothing at all (not allowed under strict) */
12781     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12782         RExC_parse = endbrace;
12783         if (strict) {
12784             RExC_parse++;   /* Position after the "}" */
12785             vFAIL("Zero length \\N{}");
12786         }
12787
12788         if (cp_count) {
12789             *cp_count = 0;
12790         }
12791         nextchar(pRExC_state);
12792         if (! node_p) {
12793             return FALSE;
12794         }
12795
12796         *node_p = reg_node(pRExC_state, NOTHING);
12797         return TRUE;
12798     }
12799
12800     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12801
12802         /* Here, the name isn't of the form  U+....  This can happen if the
12803          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
12804          * is the time to find out what the name means */
12805
12806         const STRLEN name_len = endbrace - RExC_parse;
12807         SV *  value_sv;     /* What does this name evaluate to */
12808         SV ** value_svp;
12809         const U8 * value;   /* string of name's value */
12810         STRLEN value_len;   /* and its length */
12811
12812         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
12813          *  toke.c, and their values. Make sure is initialized */
12814         if (! RExC_unlexed_names) {
12815             RExC_unlexed_names = newHV();
12816         }
12817
12818         /* If we have already seen this name in this pattern, use that.  This
12819          * allows us to only call the charnames handler once per name per
12820          * pattern.  A broken or malicious handler could return something
12821          * different each time, which could cause the results to vary depending
12822          * on if something gets added or subtracted from the pattern that
12823          * causes the number of passes to change, for example */
12824         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12825                                                       name_len, 0)))
12826         {
12827             value_sv = *value_svp;
12828         }
12829         else { /* Otherwise we have to go out and get the name */
12830             const char * error_msg = NULL;
12831             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12832                                                       UTF,
12833                                                       &error_msg);
12834             if (error_msg) {
12835                 RExC_parse = endbrace;
12836                 vFAIL(error_msg);
12837             }
12838
12839             /* If no error message, should have gotten a valid return */
12840             assert (value_sv);
12841
12842             /* Save the name's meaning for later use */
12843             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12844                            value_sv, 0))
12845             {
12846                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12847             }
12848         }
12849
12850         /* Here, we have the value the name evaluates to in 'value_sv' */
12851         value = (U8 *) SvPV(value_sv, value_len);
12852
12853         /* See if the result is one code point vs 0 or multiple */
12854         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
12855                                   ? UTF8SKIP(value)
12856                                   : 1)))
12857         {
12858             /* Here, exactly one code point.  If that isn't what is wanted,
12859              * fail */
12860             if (! code_point_p) {
12861                 RExC_parse = p;
12862                 return FALSE;
12863             }
12864
12865             /* Convert from string to numeric code point */
12866             *code_point_p = (SvUTF8(value_sv))
12867                             ? valid_utf8_to_uvchr(value, NULL)
12868                             : *value;
12869
12870             /* Have parsed this entire single code point \N{...}.  *cp_count
12871              * has already been set to 1, so don't do it again. */
12872             RExC_parse = endbrace;
12873             nextchar(pRExC_state);
12874             return TRUE;
12875         } /* End of is a single code point */
12876
12877         /* Count the code points, if caller desires.  The API says to do this
12878          * even if we will later return FALSE */
12879         if (cp_count) {
12880             *cp_count = 0;
12881
12882             *cp_count = (SvUTF8(value_sv))
12883                         ? utf8_length(value, value + value_len)
12884                         : value_len;
12885         }
12886
12887         /* Fail if caller doesn't want to handle a multi-code-point sequence.
12888          * But don't back the pointer up if the caller wants to know how many
12889          * code points there are (they need to handle it themselves in this
12890          * case).  */
12891         if (! node_p) {
12892             if (! cp_count) {
12893                 RExC_parse = p;
12894             }
12895             return FALSE;
12896         }
12897
12898         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
12899          * reg recursively to parse it.  That way, it retains its atomicness,
12900          * while not having to worry about any special handling that some code
12901          * points may have. */
12902
12903         substitute_parse = newSVpvs("?:");
12904         sv_catsv(substitute_parse, value_sv);
12905         sv_catpv(substitute_parse, ")");
12906
12907         /* The value should already be native, so no need to convert on EBCDIC
12908          * platforms.*/
12909         assert(! RExC_recode_x_to_native);
12910
12911     }
12912     else {   /* \N{U+...} */
12913         Size_t count = 0;   /* code point count kept internally */
12914
12915         /* We can get to here when the input is \N{U+...} or when toke.c has
12916          * converted a name to the \N{U+...} form.  This include changing a
12917          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12918
12919         RExC_parse += 2;    /* Skip past the 'U+' */
12920
12921         /* Code points are separated by dots.  The '}' terminates the whole
12922          * thing. */
12923
12924         do {    /* Loop until the ending brace */
12925             UV cp = 0;
12926             char * start_digit;     /* The first of the current code point */
12927             if (! isXDIGIT(*RExC_parse)) {
12928                 RExC_parse++;
12929                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12930             }
12931
12932             start_digit = RExC_parse;
12933             count++;
12934
12935             /* Loop through the hex digits of the current code point */
12936             do {
12937                 /* Adding this digit will shift the result 4 bits.  If that
12938                  * result would be above the legal max, it's overflow */
12939                 if (cp > MAX_LEGAL_CP >> 4) {
12940
12941                     /* Find the end of the code point */
12942                     do {
12943                         RExC_parse ++;
12944                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12945
12946                     /* Be sure to synchronize this message with the similar one
12947                      * in utf8.c */
12948                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
12949                         " permissible max is 0x%" UVxf,
12950                         (int) (RExC_parse - start_digit), start_digit,
12951                         MAX_LEGAL_CP);
12952                 }
12953
12954                 /* Accumulate this (valid) digit into the running total */
12955                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
12956
12957                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
12958                  * underscore separator */
12959                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12960                     RExC_parse++;
12961                 }
12962             } while (isXDIGIT(*RExC_parse));
12963
12964             /* Here, have accumulated the next code point */
12965             if (RExC_parse >= endbrace) {   /* If done ... */
12966                 if (count != 1) {
12967                     goto do_concat;
12968                 }
12969
12970                 /* Here, is a single code point; fail if doesn't want that */
12971                 if (! code_point_p) {
12972                     RExC_parse = p;
12973                     return FALSE;
12974                 }
12975
12976                 /* A single code point is easy to handle; just return it */
12977                 *code_point_p = UNI_TO_NATIVE(cp);
12978                 RExC_parse = endbrace;
12979                 nextchar(pRExC_state);
12980                 return TRUE;
12981             }
12982
12983             /* Here, the only legal thing would be a multiple character
12984              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
12985              * character must be a dot (and the one after that can't be the
12986              * endbrace, or we'd have something like \N{U+100.} ) */
12987             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12988                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
12989                                 ? UTF8SKIP(RExC_parse)
12990                                 : 1;
12991                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
12992                     RExC_parse = endbrace;
12993                 }
12994                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12995             }
12996
12997             /* Here, looks like its really a multiple character sequence.  Fail
12998              * if that's not what the caller wants.  But continue with counting
12999              * and error checking if they still want a count */
13000             if (! node_p && ! cp_count) {
13001                 return FALSE;
13002             }
13003
13004             /* What is done here is to convert this to a sub-pattern of the
13005              * form \x{char1}\x{char2}...  and then call reg recursively to
13006              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13007              * atomicness, while not having to worry about special handling
13008              * that some code points may have.  We don't create a subpattern,
13009              * but go through the motions of code point counting and error
13010              * checking, if the caller doesn't want a node returned. */
13011
13012             if (node_p && count == 1) {
13013                 substitute_parse = newSVpvs("?:");
13014             }
13015
13016           do_concat:
13017
13018             if (node_p) {
13019                 /* Convert to notation the rest of the code understands */
13020                 sv_catpvs(substitute_parse, "\\x{");
13021                 sv_catpvn(substitute_parse, start_digit,
13022                                             RExC_parse - start_digit);
13023                 sv_catpvs(substitute_parse, "}");
13024             }
13025
13026             /* Move to after the dot (or ending brace the final time through.)
13027              * */
13028             RExC_parse++;
13029             count++;
13030
13031         } while (RExC_parse < endbrace);
13032
13033         if (! node_p) { /* Doesn't want the node */
13034             assert (cp_count);
13035
13036             *cp_count = count;
13037             return FALSE;
13038         }
13039
13040         sv_catpvs(substitute_parse, ")");
13041
13042         /* The values are Unicode, and therefore have to be converted to native
13043          * on a non-Unicode (meaning non-ASCII) platform. */
13044         SET_recode_x_to_native(1);
13045     }
13046
13047     /* Here, we have the string the name evaluates to, ready to be parsed,
13048      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13049      * constructs.  This can be called from within a substitute parse already.
13050      * The error reporting mechanism doesn't work for 2 levels of this, but the
13051      * code above has validated this new construct, so there should be no
13052      * errors generated by the below.  And this isn' an exact copy, so the
13053      * mechanism to seamlessly deal with this won't work, so turn off warnings
13054      * during it */
13055     save_start = RExC_start;
13056     orig_end = RExC_end;
13057
13058     RExC_parse = RExC_start = SvPVX(substitute_parse);
13059     RExC_end = RExC_parse + SvCUR(substitute_parse);
13060     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13061
13062     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13063
13064     /* Restore the saved values */
13065     RESTORE_WARNINGS;
13066     RExC_start = save_start;
13067     RExC_parse = endbrace;
13068     RExC_end = orig_end;
13069     SET_recode_x_to_native(0);
13070
13071     SvREFCNT_dec_NN(substitute_parse);
13072
13073     if (! *node_p) {
13074         RETURN_FAIL_ON_RESTART(flags, flagp);
13075         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13076             (UV) flags);
13077     }
13078     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13079
13080     nextchar(pRExC_state);
13081
13082     return TRUE;
13083 }
13084
13085
13086 PERL_STATIC_INLINE U8
13087 S_compute_EXACTish(RExC_state_t *pRExC_state)
13088 {
13089     U8 op;
13090
13091     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13092
13093     if (! FOLD) {
13094         return (LOC)
13095                 ? EXACTL
13096                 : EXACT;
13097     }
13098
13099     op = get_regex_charset(RExC_flags);
13100     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13101         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13102                  been, so there is no hole */
13103     }
13104
13105     return op + EXACTF;
13106 }
13107
13108 STATIC bool
13109 S_new_regcurly(const char *s, const char *e)
13110 {
13111     /* This is a temporary function designed to match the most lenient form of
13112      * a {m,n} quantifier we ever envision, with either number omitted, and
13113      * spaces anywhere between/before/after them.
13114      *
13115      * If this function fails, then the string it matches is very unlikely to
13116      * ever be considered a valid quantifier, so we can allow the '{' that
13117      * begins it to be considered as a literal */
13118
13119     bool has_min = FALSE;
13120     bool has_max = FALSE;
13121
13122     PERL_ARGS_ASSERT_NEW_REGCURLY;
13123
13124     if (s >= e || *s++ != '{')
13125         return FALSE;
13126
13127     while (s < e && isSPACE(*s)) {
13128         s++;
13129     }
13130     while (s < e && isDIGIT(*s)) {
13131         has_min = TRUE;
13132         s++;
13133     }
13134     while (s < e && isSPACE(*s)) {
13135         s++;
13136     }
13137
13138     if (*s == ',') {
13139         s++;
13140         while (s < e && isSPACE(*s)) {
13141             s++;
13142         }
13143         while (s < e && isDIGIT(*s)) {
13144             has_max = TRUE;
13145             s++;
13146         }
13147         while (s < e && isSPACE(*s)) {
13148             s++;
13149         }
13150     }
13151
13152     return s < e && *s == '}' && (has_min || has_max);
13153 }
13154
13155 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13156  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13157
13158 static I32
13159 S_backref_value(char *p, char *e)
13160 {
13161     const char* endptr = e;
13162     UV val;
13163     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13164         return (I32)val;
13165     return I32_MAX;
13166 }
13167
13168
13169 /*
13170  - regatom - the lowest level
13171
13172    Try to identify anything special at the start of the current parse position.
13173    If there is, then handle it as required. This may involve generating a
13174    single regop, such as for an assertion; or it may involve recursing, such as
13175    to handle a () structure.
13176
13177    If the string doesn't start with something special then we gobble up
13178    as much literal text as we can.  If we encounter a quantifier, we have to
13179    back off the final literal character, as that quantifier applies to just it
13180    and not to the whole string of literals.
13181
13182    Once we have been able to handle whatever type of thing started the
13183    sequence, we return the offset into the regex engine program being compiled
13184    at which any  next regnode should be placed.
13185
13186    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13187    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13188    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13189    Otherwise does not return 0.
13190
13191    Note: we have to be careful with escapes, as they can be both literal
13192    and special, and in the case of \10 and friends, context determines which.
13193
13194    A summary of the code structure is:
13195
13196    switch (first_byte) {
13197         cases for each special:
13198             handle this special;
13199             break;
13200         case '\\':
13201             switch (2nd byte) {
13202                 cases for each unambiguous special:
13203                     handle this special;
13204                     break;
                cases for each ambiguous special/literal:
13206                     disambiguate;
13207                     if (special)  handle here
13208                     else goto defchar;
13209                 default: // unambiguously literal:
13210                     goto defchar;
13211             }
13212         default:  // is a literal char
13213             // FALL THROUGH
13214         defchar:
13215             create EXACTish node for literal;
13216             while (more input and node isn't full) {
13217                 switch (input_byte) {
13218                    cases for each special;
13219                        make sure parse pointer is set so that the next call to
13220                            regatom will see this special first
13221                        goto loopdone; // EXACTish node terminated by prev. char
13222                    default:
13223                        append char to EXACTISH node;
13224                 }
13225                 get next input byte;
13226             }
13227         loopdone:
13228    }
13229    return the generated node;
13230
13231    Specifically there are two separate switches for handling
13232    escape sequences, with the one for handling literal escapes requiring
13233    a dummy entry for all of the special escapes that are actually handled
13234    by the other.
13235
13236 */
13237
13238 STATIC regnode_offset
13239 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13240 {
13241     dVAR;
13242     regnode_offset ret = 0;
13243     I32 flags = 0;
13244     char *parse_start;
13245     U8 op;
13246     int invert = 0;
13247
13248     GET_RE_DEBUG_FLAGS_DECL;
13249
13250     *flagp = WORST;             /* Tentatively. */
13251
13252     DEBUG_PARSE("atom");
13253
13254     PERL_ARGS_ASSERT_REGATOM;
13255
13256   tryagain:
13257     parse_start = RExC_parse;
13258     assert(RExC_parse < RExC_end);
13259     switch ((U8)*RExC_parse) {
13260     case '^':
13261         RExC_seen_zerolen++;
13262         nextchar(pRExC_state);
13263         if (RExC_flags & RXf_PMf_MULTILINE)
13264             ret = reg_node(pRExC_state, MBOL);
13265         else
13266             ret = reg_node(pRExC_state, SBOL);
13267         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13268         break;
13269     case '$':
13270         nextchar(pRExC_state);
13271         if (*RExC_parse)
13272             RExC_seen_zerolen++;
13273         if (RExC_flags & RXf_PMf_MULTILINE)
13274             ret = reg_node(pRExC_state, MEOL);
13275         else
13276             ret = reg_node(pRExC_state, SEOL);
13277         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13278         break;
13279     case '.':
13280         nextchar(pRExC_state);
13281         if (RExC_flags & RXf_PMf_SINGLELINE)
13282             ret = reg_node(pRExC_state, SANY);
13283         else
13284             ret = reg_node(pRExC_state, REG_ANY);
13285         *flagp |= HASWIDTH|SIMPLE;
13286         MARK_NAUGHTY(1);
13287         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13288         break;
13289     case '[':
13290     {
13291         char * const oregcomp_parse = ++RExC_parse;
13292         ret = regclass(pRExC_state, flagp, depth+1,
13293                        FALSE, /* means parse the whole char class */
13294                        TRUE, /* allow multi-char folds */
13295                        FALSE, /* don't silence non-portable warnings. */
13296                        (bool) RExC_strict,
13297                        TRUE, /* Allow an optimized regnode result */
13298                        NULL);
13299         if (ret == 0) {
13300             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13301             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13302                   (UV) *flagp);
13303         }
13304         if (*RExC_parse != ']') {
13305             RExC_parse = oregcomp_parse;
13306             vFAIL("Unmatched [");
13307         }
13308         nextchar(pRExC_state);
13309         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13310         break;
13311     }
13312     case '(':
13313         nextchar(pRExC_state);
13314         ret = reg(pRExC_state, 2, &flags, depth+1);
13315         if (ret == 0) {
13316                 if (flags & TRYAGAIN) {
13317                     if (RExC_parse >= RExC_end) {
13318                          /* Make parent create an empty node if needed. */
13319                         *flagp |= TRYAGAIN;
13320                         return(0);
13321                     }
13322                     goto tryagain;
13323                 }
13324                 RETURN_FAIL_ON_RESTART(flags, flagp);
13325                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13326                                                                  (UV) flags);
13327         }
13328         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13329         break;
13330     case '|':
13331     case ')':
13332         if (flags & TRYAGAIN) {
13333             *flagp |= TRYAGAIN;
13334             return 0;
13335         }
13336         vFAIL("Internal urp");
13337                                 /* Supposed to be caught earlier. */
13338         break;
13339     case '?':
13340     case '+':
13341     case '*':
13342         RExC_parse++;
13343         vFAIL("Quantifier follows nothing");
13344         break;
13345     case '\\':
13346         /* Special Escapes
13347
13348            This switch handles escape sequences that resolve to some kind
13349            of special regop and not to literal text. Escape sequences that
13350            resolve to literal text are handled below in the switch marked
13351            "Literal Escapes".
13352
13353            Every entry in this switch *must* have a corresponding entry
13354            in the literal escape switch. However, the opposite is not
13355            required, as the default for this switch is to jump to the
13356            literal text handling code.
13357         */
13358         RExC_parse++;
13359         switch ((U8)*RExC_parse) {
13360         /* Special Escapes */
13361         case 'A':
13362             RExC_seen_zerolen++;
13363             ret = reg_node(pRExC_state, SBOL);
13364             /* SBOL is shared with /^/ so we set the flags so we can tell
13365              * /\A/ from /^/ in split. */
13366             FLAGS(REGNODE_p(ret)) = 1;
13367             *flagp |= SIMPLE;
13368             goto finish_meta_pat;
13369         case 'G':
13370             ret = reg_node(pRExC_state, GPOS);
13371             RExC_seen |= REG_GPOS_SEEN;
13372             *flagp |= SIMPLE;
13373             goto finish_meta_pat;
13374         case 'K':
13375             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13376                 RExC_seen_zerolen++;
13377                 ret = reg_node(pRExC_state, KEEPS);
13378                 *flagp |= SIMPLE;
13379                 /* XXX:dmq : disabling in-place substitution seems to
13380                  * be necessary here to avoid cases of memory corruption, as
13381                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13382                  */
13383                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13384                 goto finish_meta_pat;
13385             }
13386             else {
13387                 ++RExC_parse; /* advance past the 'K' */
13388                 vFAIL("\\K not permitted in lookahead/lookbehind");
13389             }
13390         case 'Z':
13391             ret = reg_node(pRExC_state, SEOL);
13392             *flagp |= SIMPLE;
13393             RExC_seen_zerolen++;                /* Do not optimize RE away */
13394             goto finish_meta_pat;
13395         case 'z':
13396             ret = reg_node(pRExC_state, EOS);
13397             *flagp |= SIMPLE;
13398             RExC_seen_zerolen++;                /* Do not optimize RE away */
13399             goto finish_meta_pat;
13400         case 'C':
13401             vFAIL("\\C no longer supported");
13402         case 'X':
13403             ret = reg_node(pRExC_state, CLUMP);
13404             *flagp |= HASWIDTH;
13405             goto finish_meta_pat;
13406
13407         case 'B':
13408             invert = 1;
13409             /* FALLTHROUGH */
13410         case 'b':
13411           {
13412             U8 flags = 0;
13413             regex_charset charset = get_regex_charset(RExC_flags);
13414
13415             RExC_seen_zerolen++;
13416             RExC_seen |= REG_LOOKBEHIND_SEEN;
13417             op = BOUND + charset;
13418
13419             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13420                 flags = TRADITIONAL_BOUND;
13421                 if (op > BOUNDA) {  /* /aa is same as /a */
13422                     op = BOUNDA;
13423                 }
13424             }
13425             else {
13426                 STRLEN length;
13427                 char name = *RExC_parse;
13428                 char * endbrace = NULL;
13429                 RExC_parse += 2;
13430                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13431
13432                 if (! endbrace) {
13433                     vFAIL2("Missing right brace on \\%c{}", name);
13434                 }
13435                 /* XXX Need to decide whether to take spaces or not.  Should be
13436                  * consistent with \p{}, but that currently is SPACE, which
13437                  * means vertical too, which seems wrong
13438                  * while (isBLANK(*RExC_parse)) {
13439                     RExC_parse++;
13440                 }*/
13441                 if (endbrace == RExC_parse) {
13442                     RExC_parse++;  /* After the '}' */
13443                     vFAIL2("Empty \\%c{}", name);
13444                 }
13445                 length = endbrace - RExC_parse;
13446                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13447                     length--;
13448                 }*/
13449                 switch (*RExC_parse) {
13450                     case 'g':
13451                         if (    length != 1
13452                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13453                         {
13454                             goto bad_bound_type;
13455                         }
13456                         flags = GCB_BOUND;
13457                         break;
13458                     case 'l':
13459                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13460                             goto bad_bound_type;
13461                         }
13462                         flags = LB_BOUND;
13463                         break;
13464                     case 's':
13465                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13466                             goto bad_bound_type;
13467                         }
13468                         flags = SB_BOUND;
13469                         break;
13470                     case 'w':
13471                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13472                             goto bad_bound_type;
13473                         }
13474                         flags = WB_BOUND;
13475                         break;
13476                     default:
13477                       bad_bound_type:
13478                         RExC_parse = endbrace;
13479                         vFAIL2utf8f(
13480                             "'%" UTF8f "' is an unknown bound type",
13481                             UTF8fARG(UTF, length, endbrace - length));
13482                         NOT_REACHED; /*NOTREACHED*/
13483                 }
13484                 RExC_parse = endbrace;
13485                 REQUIRE_UNI_RULES(flagp, 0);
13486
13487                 if (op == BOUND) {
13488                     op = BOUNDU;
13489                 }
13490                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13491                     op = BOUNDU;
13492                     length += 4;
13493
13494                     /* Don't have to worry about UTF-8 in this message, because
13495                      * to get here the contents of the \b must be ASCII */
13496                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13497                               "Using /u for '%.*s' instead of /%s",
13498                               (unsigned) length,
13499                               endbrace - length + 1,
13500                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13501                               ? ASCII_RESTRICT_PAT_MODS
13502                               : ASCII_MORE_RESTRICT_PAT_MODS);
13503                 }
13504             }
13505
13506             if (op == BOUND) {
13507                 RExC_seen_d_op = TRUE;
13508             }
13509             else if (op == BOUNDL) {
13510                 RExC_contains_locale = 1;
13511             }
13512
13513             if (invert) {
13514                 op += NBOUND - BOUND;
13515             }
13516
13517             ret = reg_node(pRExC_state, op);
13518             FLAGS(REGNODE_p(ret)) = flags;
13519
13520             *flagp |= SIMPLE;
13521
13522             goto finish_meta_pat;
13523           }
13524
13525         case 'R':
13526             ret = reg_node(pRExC_state, LNBREAK);
13527             *flagp |= HASWIDTH|SIMPLE;
13528             goto finish_meta_pat;
13529
13530         case 'd':
13531         case 'D':
13532         case 'h':
13533         case 'H':
13534         case 'p':
13535         case 'P':
13536         case 's':
13537         case 'S':
13538         case 'v':
13539         case 'V':
13540         case 'w':
13541         case 'W':
13542             /* These all have the same meaning inside [brackets], and it knows
13543              * how to do the best optimizations for them.  So, pretend we found
13544              * these within brackets, and let it do the work */
13545             RExC_parse--;
13546
13547             ret = regclass(pRExC_state, flagp, depth+1,
13548                            TRUE, /* means just parse this element */
13549                            FALSE, /* don't allow multi-char folds */
13550                            FALSE, /* don't silence non-portable warnings.  It
13551                                      would be a bug if these returned
13552                                      non-portables */
13553                            (bool) RExC_strict,
13554                            TRUE, /* Allow an optimized regnode result */
13555                            NULL);
13556             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13557             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13558              * multi-char folds are allowed.  */
13559             if (!ret)
13560                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13561                       (UV) *flagp);
13562
13563             RExC_parse--;   /* regclass() leaves this one too far ahead */
13564
13565           finish_meta_pat:
13566                    /* The escapes above that don't take a parameter can't be
13567                     * followed by a '{'.  But 'pX', 'p{foo}' and
13568                     * correspondingly 'P' can be */
13569             if (   RExC_parse - parse_start == 1
13570                 && UCHARAT(RExC_parse + 1) == '{'
13571                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13572             {
13573                 RExC_parse += 2;
13574                 vFAIL("Unescaped left brace in regex is illegal here");
13575             }
13576             Set_Node_Offset(REGNODE_p(ret), parse_start);
13577             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13578             nextchar(pRExC_state);
13579             break;
13580         case 'N':
13581             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13582              * \N{...} evaluates to a sequence of more than one code point).
13583              * The function call below returns a regnode, which is our result.
13584              * The parameters cause it to fail if the \N{} evaluates to a
13585              * single code point; we handle those like any other literal.  The
13586              * reason that the multicharacter case is handled here and not as
13587              * part of the EXACtish code is because of quantifiers.  In
13588              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13589              * this way makes that Just Happen. dmq.
13590              * join_exact() will join this up with adjacent EXACTish nodes
13591              * later on, if appropriate. */
13592             ++RExC_parse;
13593             if (grok_bslash_N(pRExC_state,
13594                               &ret,     /* Want a regnode returned */
13595                               NULL,     /* Fail if evaluates to a single code
13596                                            point */
13597                               NULL,     /* Don't need a count of how many code
13598                                            points */
13599                               flagp,
13600                               RExC_strict,
13601                               depth)
13602             ) {
13603                 break;
13604             }
13605
13606             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13607
13608             /* Here, evaluates to a single code point.  Go get that */
13609             RExC_parse = parse_start;
13610             goto defchar;
13611
13612         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13613       parse_named_seq:
13614         {
13615             char ch;
13616             if (   RExC_parse >= RExC_end - 1
13617                 || ((   ch = RExC_parse[1]) != '<'
13618                                       && ch != '\''
13619                                       && ch != '{'))
13620             {
13621                 RExC_parse++;
13622                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13623                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13624             } else {
13625                 RExC_parse += 2;
13626                 ret = handle_named_backref(pRExC_state,
13627                                            flagp,
13628                                            parse_start,
13629                                            (ch == '<')
13630                                            ? '>'
13631                                            : (ch == '{')
13632                                              ? '}'
13633                                              : '\'');
13634             }
13635             break;
13636         }
13637         case 'g':
13638         case '1': case '2': case '3': case '4':
13639         case '5': case '6': case '7': case '8': case '9':
13640             {
13641                 I32 num;
13642                 bool hasbrace = 0;
13643
13644                 if (*RExC_parse == 'g') {
13645                     bool isrel = 0;
13646
13647                     RExC_parse++;
13648                     if (*RExC_parse == '{') {
13649                         RExC_parse++;
13650                         hasbrace = 1;
13651                     }
13652                     if (*RExC_parse == '-') {
13653                         RExC_parse++;
13654                         isrel = 1;
13655                     }
13656                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13657                         if (isrel) RExC_parse--;
13658                         RExC_parse -= 2;
13659                         goto parse_named_seq;
13660                     }
13661
13662                     if (RExC_parse >= RExC_end) {
13663                         goto unterminated_g;
13664                     }
13665                     num = S_backref_value(RExC_parse, RExC_end);
13666                     if (num == 0)
13667                         vFAIL("Reference to invalid group 0");
13668                     else if (num == I32_MAX) {
13669                          if (isDIGIT(*RExC_parse))
13670                             vFAIL("Reference to nonexistent group");
13671                         else
13672                           unterminated_g:
13673                             vFAIL("Unterminated \\g... pattern");
13674                     }
13675
13676                     if (isrel) {
13677                         num = RExC_npar - num;
13678                         if (num < 1)
13679                             vFAIL("Reference to nonexistent or unclosed group");
13680                     }
13681                 }
13682                 else {
13683                     num = S_backref_value(RExC_parse, RExC_end);
13684                     /* bare \NNN might be backref or octal - if it is larger
13685                      * than or equal to RExC_npar then it is assumed to be an
13686                      * octal escape. Note RExC_npar is +1 from the actual
13687                      * number of parens. */
13688                     /* Note we do NOT check if num == I32_MAX here, as that is
13689                      * handled by the RExC_npar check */
13690
13691                     if (
13692                         /* any numeric escape < 10 is always a backref */
13693                         num > 9
13694                         /* any numeric escape < RExC_npar is a backref */
13695                         && num >= RExC_npar
13696                         /* cannot be an octal escape if it starts with 8 */
13697                         && *RExC_parse != '8'
13698                         /* cannot be an octal escape if it starts with 9 */
13699                         && *RExC_parse != '9'
13700                     ) {
13701                         /* Probably not meant to be a backref, instead likely
13702                          * to be an octal character escape, e.g. \35 or \777.
13703                          * The above logic should make it obvious why using
13704                          * octal escapes in patterns is problematic. - Yves */
13705                         RExC_parse = parse_start;
13706                         goto defchar;
13707                     }
13708                 }
13709
13710                 /* At this point RExC_parse points at a numeric escape like
13711                  * \12 or \88 or something similar, which we should NOT treat
13712                  * as an octal escape. It may or may not be a valid backref
13713                  * escape. For instance \88888888 is unlikely to be a valid
13714                  * backref. */
13715                 while (isDIGIT(*RExC_parse))
13716                     RExC_parse++;
13717                 if (hasbrace) {
13718                     if (*RExC_parse != '}')
13719                         vFAIL("Unterminated \\g{...} pattern");
13720                     RExC_parse++;
13721                 }
13722                 if (num >= (I32)RExC_npar) {
13723
13724                     /* It might be a forward reference; we can't fail until we
13725                      * know, by completing the parse to get all the groups, and
13726                      * then reparsing */
13727                     if (ALL_PARENS_COUNTED)  {
13728                         if (num >= RExC_total_parens)  {
13729                             vFAIL("Reference to nonexistent group");
13730                         }
13731                     }
13732                     else {
13733                         REQUIRE_PARENS_PASS;
13734                     }
13735                 }
13736                 RExC_sawback = 1;
13737                 ret = reganode(pRExC_state,
13738                                ((! FOLD)
13739                                  ? REF
13740                                  : (ASCII_FOLD_RESTRICTED)
13741                                    ? REFFA
13742                                    : (AT_LEAST_UNI_SEMANTICS)
13743                                      ? REFFU
13744                                      : (LOC)
13745                                        ? REFFL
13746                                        : REFF),
13747                                 num);
13748                 if (OP(REGNODE_p(ret)) == REFF) {
13749                     RExC_seen_d_op = TRUE;
13750                 }
13751                 *flagp |= HASWIDTH;
13752
13753                 /* override incorrect value set in reganode MJD */
13754                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13755                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13756                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13757                                         FALSE /* Don't force to /x */ );
13758             }
13759             break;
13760         case '\0':
13761             if (RExC_parse >= RExC_end)
13762                 FAIL("Trailing \\");
13763             /* FALLTHROUGH */
13764         default:
13765             /* Do not generate "unrecognized" warnings here, we fall
13766                back into the quick-grab loop below */
13767             RExC_parse = parse_start;
13768             goto defchar;
13769         } /* end of switch on a \foo sequence */
13770         break;
13771
13772     case '#':
13773
13774         /* '#' comments should have been spaced over before this function was
13775          * called */
13776         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13777         /*
13778         if (RExC_flags & RXf_PMf_EXTENDED) {
13779             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13780             if (RExC_parse < RExC_end)
13781                 goto tryagain;
13782         }
13783         */
13784
13785         /* FALLTHROUGH */
13786
13787     default:
13788           defchar: {
13789
13790             /* Here, we have determined that the next thing is probably a
13791              * literal character.  RExC_parse points to the first byte of its
13792              * definition.  (It still may be an escape sequence that evaluates
13793              * to a single character) */
13794
13795             STRLEN len = 0;
13796             UV ender = 0;
13797             char *p;
13798             char *s, *old_s = NULL, *old_old_s = NULL;
13799             char *s0;
13800             U32 max_string_len = 255;
13801
13802             /* We may have to reparse the node, artificially stopping filling
13803              * it early, based on info gleaned in the first parse.  This
13804              * variable gives where we stop.  Make it above the normal stopping
13805              * place first time through; otherwise it would stop too early */
13806             U32 upper_fill = max_string_len + 1;
13807
13808             /* We start out as an EXACT node, even if under /i, until we find a
13809              * character which is in a fold.  The algorithm now segregates into
13810              * separate nodes, characters that fold from those that don't under
13811              * /i.  (This hopefully will create nodes that are fixed strings
13812              * even under /i, giving the optimizer something to grab on to.)
13813              * So, if a node has something in it and the next character is in
13814              * the opposite category, that node is closed up, and the function
13815              * returns.  Then regatom is called again, and a new node is
13816              * created for the new category. */
13817             U8 node_type = EXACT;
13818
13819             /* Assume the node will be fully used; the excess is given back at
13820              * the end.  Under /i, we may need to temporarily add the fold of
13821              * an extra character or two at the end to check for splitting
13822              * multi-char folds, so allocate extra space for that.   We can't
13823              * make any other length assumptions, as a byte input sequence
13824              * could shrink down. */
13825             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
13826                                                  + ((! FOLD)
13827                                                     ? 0
13828                                                     : 2 * ((UTF)
13829                                                            ? UTF8_MAXBYTES_CASE
13830                         /* Max non-UTF-8 expansion is 2 */ : 2)));
13831
13832             bool next_is_quantifier;
13833             char * oldp = NULL;
13834
13835             /* We can convert EXACTF nodes to EXACTFU if they contain only
13836              * characters that match identically regardless of the target
13837              * string's UTF8ness.  The reason to do this is that EXACTF is not
13838              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13839              * runtime.
13840              *
13841              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13842              * contain only above-Latin1 characters (hence must be in UTF8),
13843              * which don't participate in folds with Latin1-range characters,
13844              * as the latter's folds aren't known until runtime. */
13845             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13846
13847             /* Single-character EXACTish nodes are almost always SIMPLE.  This
13848              * allows us to override this as encountered */
13849             U8 maybe_SIMPLE = SIMPLE;
13850
13851             /* Does this node contain something that can't match unless the
13852              * target string is (also) in UTF-8 */
13853             bool requires_utf8_target = FALSE;
13854
13855             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13856             bool has_ss = FALSE;
13857
13858             /* So is the MICRO SIGN */
13859             bool has_micro_sign = FALSE;
13860
13861             /* Set when we fill up the current node and there is still more
13862              * text to process */
13863             bool overflowed;
13864
13865             /* Allocate an EXACT node.  The node_type may change below to
13866              * another EXACTish node, but since the size of the node doesn't
13867              * change, it works */
13868             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
13869                                                                     "exact");
13870             FILL_NODE(ret, node_type);
13871             RExC_emit++;
13872
13873             s = STRING(REGNODE_p(ret));
13874
13875             s0 = s;
13876
13877           reparse:
13878
13879             p = RExC_parse;
13880             len = 0;
13881             s = s0;
13882             node_type = EXACT;
13883             oldp = NULL;
13884             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13885             maybe_SIMPLE = SIMPLE;
13886             requires_utf8_target = FALSE;
13887             has_ss = FALSE;
13888             has_micro_sign = FALSE;
13889
13890           continue_parse:
13891
13892             /* This breaks under rare circumstances.  If folding, we do not
13893              * want to split a node at a character that is a non-final in a
13894              * multi-char fold, as an input string could just happen to want to
13895              * match across the node boundary.  The code at the end of the loop
13896              * looks for this, and backs off until it finds a character that
13897              * is not one, but it is possible (though extremely, extremely
13898              * unlikely) for all characters in the node to be non-final fold
13899              * ones, in which case we just leave the node fully filled, and
13900              * hope that it doesn't match the string in just the wrong place */
13901
13902             assert( ! UTF     /* Is at the beginning of a character */
13903                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13904                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13905
13906             overflowed = FALSE;
13907
13908             /* Here, we have a literal character.  Find the maximal string of
13909              * them in the input that we can fit into a single EXACTish node.
13910              * We quit at the first non-literal or when the node gets full, or
13911              * under /i the categorization of folding/non-folding character
13912              * changes */
13913             while (p < RExC_end && len < upper_fill) {
13914
13915                 /* In most cases each iteration adds one byte to the output.
13916                  * The exceptions override this */
13917                 Size_t added_len = 1;
13918
13919                 oldp = p;
13920                 old_old_s = old_s;
13921                 old_s = s;
13922
13923                 /* White space has already been ignored */
13924                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13925                        || ! is_PATWS_safe((p), RExC_end, UTF));
13926
13927                 switch ((U8)*p) {
13928                 case '^':
13929                 case '$':
13930                 case '.':
13931                 case '[':
13932                 case '(':
13933                 case ')':
13934                 case '|':
13935                     goto loopdone;
13936                 case '\\':
13937                     /* Literal Escapes Switch
13938
13939                        This switch is meant to handle escape sequences that
13940                        resolve to a literal character.
13941
13942                        Every escape sequence that represents something
13943                        else, like an assertion or a char class, is handled
13944                        in the switch marked 'Special Escapes' above in this
13945                        routine, but also has an entry here as anything that
13946                        isn't explicitly mentioned here will be treated as
13947                        an unescaped equivalent literal.
13948                     */
13949
13950                     switch ((U8)*++p) {
13951
13952                     /* These are all the special escapes. */
13953                     case 'A':             /* Start assertion */
13954                     case 'b': case 'B':   /* Word-boundary assertion*/
13955                     case 'C':             /* Single char !DANGEROUS! */
13956                     case 'd': case 'D':   /* digit class */
13957                     case 'g': case 'G':   /* generic-backref, pos assertion */
13958                     case 'h': case 'H':   /* HORIZWS */
13959                     case 'k': case 'K':   /* named backref, keep marker */
13960                     case 'p': case 'P':   /* Unicode property */
13961                               case 'R':   /* LNBREAK */
13962                     case 's': case 'S':   /* space class */
13963                     case 'v': case 'V':   /* VERTWS */
13964                     case 'w': case 'W':   /* word class */
13965                     case 'X':             /* eXtended Unicode "combining
13966                                              character sequence" */
13967                     case 'z': case 'Z':   /* End of line/string assertion */
13968                         --p;
13969                         goto loopdone;
13970
13971                     /* Anything after here is an escape that resolves to a
13972                        literal. (Except digits, which may or may not)
13973                      */
13974                     case 'n':
13975                         ender = '\n';
13976                         p++;
13977                         break;
13978                     case 'N': /* Handle a single-code point named character. */
13979                         RExC_parse = p + 1;
13980                         if (! grok_bslash_N(pRExC_state,
13981                                             NULL,   /* Fail if evaluates to
13982                                                        anything other than a
13983                                                        single code point */
13984                                             &ender, /* The returned single code
13985                                                        point */
13986                                             NULL,   /* Don't need a count of
13987                                                        how many code points */
13988                                             flagp,
13989                                             RExC_strict,
13990                                             depth)
13991                         ) {
13992                             if (*flagp & NEED_UTF8)
13993                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13994                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13995
13996                             /* Here, it wasn't a single code point.  Go close
13997                              * up this EXACTish node.  The switch() prior to
13998                              * this switch handles the other cases */
13999                             RExC_parse = p = oldp;
14000                             goto loopdone;
14001                         }
14002                         p = RExC_parse;
14003                         RExC_parse = parse_start;
14004
14005                         /* The \N{} means the pattern, if previously /d,
14006                          * becomes /u.  That means it can't be an EXACTF node,
14007                          * but an EXACTFU */
14008                         if (node_type == EXACTF) {
14009                             node_type = EXACTFU;
14010
14011                             /* If the node already contains something that
14012                              * differs between EXACTF and EXACTFU, reparse it
14013                              * as EXACTFU */
14014                             if (! maybe_exactfu) {
14015                                 len = 0;
14016                                 s = s0;
14017                                 goto reparse;
14018                             }
14019                         }
14020
14021                         break;
14022                     case 'r':
14023                         ender = '\r';
14024                         p++;
14025                         break;
14026                     case 't':
14027                         ender = '\t';
14028                         p++;
14029                         break;
14030                     case 'f':
14031                         ender = '\f';
14032                         p++;
14033                         break;
14034                     case 'e':
14035                         ender = ESC_NATIVE;
14036                         p++;
14037                         break;
14038                     case 'a':
14039                         ender = '\a';
14040                         p++;
14041                         break;
14042                     case 'o':
14043                         {
14044                             UV result;
14045                             const char* error_msg;
14046
14047                             bool valid = grok_bslash_o(&p,
14048                                                        RExC_end,
14049                                                        &result,
14050                                                        &error_msg,
14051                                                        TO_OUTPUT_WARNINGS(p),
14052                                                        (bool) RExC_strict,
14053                                                        TRUE, /* Output warnings
14054                                                                 for non-
14055                                                                 portables */
14056                                                        UTF);
14057                             if (! valid) {
14058                                 RExC_parse = p; /* going to die anyway; point
14059                                                    to exact spot of failure */
14060                                 vFAIL(error_msg);
14061                             }
14062                             UPDATE_WARNINGS_LOC(p - 1);
14063                             ender = result;
14064                             break;
14065                         }
14066                     case 'x':
14067                         {
14068                             UV result = UV_MAX; /* initialize to erroneous
14069                                                    value */
14070                             const char* error_msg;
14071
14072                             bool valid = grok_bslash_x(&p,
14073                                                        RExC_end,
14074                                                        &result,
14075                                                        &error_msg,
14076                                                        TO_OUTPUT_WARNINGS(p),
14077                                                        (bool) RExC_strict,
14078                                                        TRUE, /* Silence warnings
14079                                                                 for non-
14080                                                                 portables */
14081                                                        UTF);
14082                             if (! valid) {
14083                                 RExC_parse = p; /* going to die anyway; point
14084                                                    to exact spot of failure */
14085                                 vFAIL(error_msg);
14086                             }
14087                             UPDATE_WARNINGS_LOC(p - 1);
14088                             ender = result;
14089
14090 #ifdef EBCDIC
14091                             if (ender < 0x100) {
14092                                 if (RExC_recode_x_to_native) {
14093                                     ender = LATIN1_TO_NATIVE(ender);
14094                                 }
14095                             }
14096 #endif
14097                             break;
14098                         }
14099                     case 'c':
14100                         p++;
14101                         ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14102                         UPDATE_WARNINGS_LOC(p);
14103                         p++;
14104                         break;
14105                     case '8': case '9': /* must be a backreference */
14106                         --p;
14107                         /* we have an escape like \8 which cannot be an octal escape
14108                          * so we exit the loop, and let the outer loop handle this
14109                          * escape which may or may not be a legitimate backref. */
14110                         goto loopdone;
14111                     case '1': case '2': case '3':case '4':
14112                     case '5': case '6': case '7':
14113                         /* When we parse backslash escapes there is ambiguity
14114                          * between backreferences and octal escapes. Any escape
14115                          * from \1 - \9 is a backreference, any multi-digit
14116                          * escape which does not start with 0 and which when
14117                          * evaluated as decimal could refer to an already
14118                          * parsed capture buffer is a back reference. Anything
14119                          * else is octal.
14120                          *
14121                          * Note this implies that \118 could be interpreted as
14122                          * 118 OR as "\11" . "8" depending on whether there
14123                          * were 118 capture buffers defined already in the
14124                          * pattern.  */
14125
14126                         /* NOTE, RExC_npar is 1 more than the actual number of
14127                          * parens we have seen so far, hence the "<" as opposed
14128                          * to "<=" */
14129                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14130                         {  /* Not to be treated as an octal constant, go
14131                                    find backref */
14132                             --p;
14133                             goto loopdone;
14134                         }
14135                         /* FALLTHROUGH */
14136                     case '0':
14137                         {
14138                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14139                             STRLEN numlen = 3;
14140                             ender = grok_oct(p, &numlen, &flags, NULL);
14141                             p += numlen;
14142                             if (   isDIGIT(*p)  /* like \08, \178 */
14143                                 && ckWARN(WARN_REGEXP)
14144                                 && numlen < 3)
14145                             {
14146                                 reg_warn_non_literal_string(
14147                                          p + 1,
14148                                          form_short_octal_warning(p, numlen));
14149                             }
14150                         }
14151                         break;
14152                     case '\0':
14153                         if (p >= RExC_end)
14154                             FAIL("Trailing \\");
14155                         /* FALLTHROUGH */
14156                     default:
14157                         if (isALPHANUMERIC(*p)) {
14158                             /* An alpha followed by '{' is going to fail next
14159                              * iteration, so don't output this warning in that
14160                              * case */
14161                             if (! isALPHA(*p) || *(p + 1) != '{') {
14162                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14163                                                   " passed through", p);
14164                             }
14165                         }
14166                         goto normal_default;
14167                     } /* End of switch on '\' */
14168                     break;
14169                 case '{':
14170                     /* Trying to gain new uses for '{' without breaking too
14171                      * much existing code is hard.  The solution currently
14172                      * adopted is:
14173                      *  1)  If there is no ambiguity that a '{' should always
14174                      *      be taken literally, at the start of a construct, we
14175                      *      just do so.
14176                      *  2)  If the literal '{' conflicts with our desired use
14177                      *      of it as a metacharacter, we die.  The deprecation
14178                      *      cycles for this have come and gone.
14179                      *  3)  If there is ambiguity, we raise a simple warning.
14180                      *      This could happen, for example, if the user
14181                      *      intended it to introduce a quantifier, but slightly
14182                      *      misspelled the quantifier.  Without this warning,
14183                      *      the quantifier would silently be taken as a literal
14184                      *      string of characters instead of a meta construct */
14185                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14186                         if (      RExC_strict
14187                             || (  p > parse_start + 1
14188                                 && isALPHA_A(*(p - 1))
14189                                 && *(p - 2) == '\\')
14190                             || new_regcurly(p, RExC_end))
14191                         {
14192                             RExC_parse = p + 1;
14193                             vFAIL("Unescaped left brace in regex is "
14194                                   "illegal here");
14195                         }
14196                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14197                                          " passed through");
14198                     }
14199                     goto normal_default;
14200                 case '}':
14201                 case ']':
14202                     if (p > RExC_parse && RExC_strict) {
14203                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14204                     }
14205                     /*FALLTHROUGH*/
14206                 default:    /* A literal character */
14207                   normal_default:
14208                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14209                         STRLEN numlen;
14210                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14211                                                &numlen, UTF8_ALLOW_DEFAULT);
14212                         p += numlen;
14213                     }
14214                     else
14215                         ender = (U8) *p++;
14216                     break;
14217                 } /* End of switch on the literal */
14218
14219                 /* Here, have looked at the literal character, and <ender>
14220                  * contains its ordinal; <p> points to the character after it.
14221                  * */
14222
14223                 if (ender > 255) {
14224                     REQUIRE_UTF8(flagp);
14225                 }
14226
14227                 /* We need to check if the next non-ignored thing is a
14228                  * quantifier.  Move <p> to after anything that should be
14229                  * ignored, which, as a side effect, positions <p> for the next
14230                  * loop iteration */
14231                 skip_to_be_ignored_text(pRExC_state, &p,
14232                                         FALSE /* Don't force to /x */ );
14233
14234                 /* If the next thing is a quantifier, it applies to this
14235                  * character only, which means that this character has to be in
14236                  * its own node and can't just be appended to the string in an
14237                  * existing node, so if there are already other characters in
14238                  * the node, close the node with just them, and set up to do
14239                  * this character again next time through, when it will be the
14240                  * only thing in its new node */
14241
14242                 next_is_quantifier =    LIKELY(p < RExC_end)
14243                                      && UNLIKELY(ISMULT2(p));
14244
14245                 if (next_is_quantifier && LIKELY(len)) {
14246                     p = oldp;
14247                     goto loopdone;
14248                 }
14249
14250                 /* Ready to add 'ender' to the node */
14251
14252                 if (! FOLD) {  /* The simple case, just append the literal */
14253                   not_fold_common:
14254
14255                     /* Don't output if it would overflow */
14256                     if (UNLIKELY(len > max_string_len - ((UTF)
14257                                                       ? UVCHR_SKIP(ender)
14258                                                       : 1)))
14259                     {
14260                         overflowed = TRUE;
14261                         break;
14262                     }
14263
14264                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14265                         *(s++) = (char) ender;
14266                     }
14267                     else {
14268                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14269                         added_len = (char *) new_s - s;
14270                         s = (char *) new_s;
14271
14272                         if (ender > 255)  {
14273                             requires_utf8_target = TRUE;
14274                         }
14275                     }
14276                 }
14277                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14278
14279                     /* Here are folding under /l, and the code point is
14280                      * problematic.  If this is the first character in the
14281                      * node, change the node type to folding.   Otherwise, if
14282                      * this is the first problematic character, close up the
14283                      * existing node, so can start a new node with this one */
14284                     if (! len) {
14285                         node_type = EXACTFL;
14286                         RExC_contains_locale = 1;
14287                     }
14288                     else if (node_type == EXACT) {
14289                         p = oldp;
14290                         goto loopdone;
14291                     }
14292
14293                     /* This problematic code point means we can't simplify
14294                      * things */
14295                     maybe_exactfu = FALSE;
14296
14297                     /* Here, we are adding a problematic fold character.
14298                      * "Problematic" in this context means that its fold isn't
14299                      * known until runtime.  (The non-problematic code points
14300                      * are the above-Latin1 ones that fold to also all
14301                      * above-Latin1.  Their folds don't vary no matter what the
14302                      * locale is.) But here we have characters whose fold
14303                      * depends on the locale.  We just add in the unfolded
14304                      * character, and wait until runtime to fold it */
14305                     goto not_fold_common;
14306                 }
14307                 else /* regular fold; see if actually is in a fold */
14308                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14309                          || (ender > 255
14310                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14311                 {
14312                     /* Here, folding, but the character isn't in a fold.
14313                      *
14314                      * Start a new node if previous characters in the node were
14315                      * folded */
14316                     if (len && node_type != EXACT) {
14317                         p = oldp;
14318                         goto loopdone;
14319                     }
14320
14321                     /* Here, continuing a node with non-folded characters.  Add
14322                      * this one */
14323                     goto not_fold_common;
14324                 }
14325                 else {  /* Here, does participate in some fold */
14326
14327                     /* If this is the first character in the node, change its
14328                      * type to folding.  Otherwise, if this is the first
14329                      * folding character in the node, close up the existing
14330                      * node, so can start a new node with this one.  */
14331                     if (! len) {
14332                         node_type = compute_EXACTish(pRExC_state);
14333                     }
14334                     else if (node_type == EXACT) {
14335                         p = oldp;
14336                         goto loopdone;
14337                     }
14338
14339                     if (UTF) {  /* Always use the folded value for UTF-8
14340                                    patterns */
14341                         if (UVCHR_IS_INVARIANT(ender)) {
14342                             if (UNLIKELY(len + 1 > max_string_len)) {
14343                                 overflowed = TRUE;
14344                                 break;
14345                             }
14346
14347                             *(s)++ = (U8) toFOLD(ender);
14348                         }
14349                         else {
14350                             UV folded = _to_uni_fold_flags(
14351                                     ender,
14352                                     (U8 *) s,  /* We have allocated extra space
14353                                                   in 's' so can't run off the
14354                                                   end */
14355                                     &added_len,
14356                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14357                                                     ? FOLD_FLAGS_NOMIX_ASCII
14358                                                     : 0));
14359                             if (UNLIKELY(len + added_len > max_string_len)) {
14360                                 overflowed = TRUE;
14361                                 break;
14362                             }
14363
14364                             s += added_len;
14365
14366                             if (   folded > 255
14367                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14368                             {
14369                                 /* U+B5 folds to the MU, so it's possible for a
14370                                  * non-UTF-8 target to match it */
14371                                 requires_utf8_target = TRUE;
14372                             }
14373                         }
14374                     }
14375                     else { /* Here is non-UTF8. */
14376
14377                         /* The fold will be one or (rarely) two characters.
14378                          * Check that there's room for at least a single one
14379                          * before setting any flags, etc.  Because otherwise an
14380                          * overflowing character could cause a flag to be set
14381                          * even though it doesn't end up in this node.  (For
14382                          * the two character fold, we check again, before
14383                          * setting any flags) */
14384                         if (UNLIKELY(len + 1 > max_string_len)) {
14385                             overflowed = TRUE;
14386                             break;
14387                         }
14388
14389 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14390    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14391                                       || UNICODE_DOT_DOT_VERSION > 0)
14392
14393                         /* On non-ancient Unicodes, check for the only possible
14394                          * multi-char fold  */
14395                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14396
14397                             /* This potential multi-char fold means the node
14398                              * can't be simple (because it could match more
14399                              * than a single char).  And in some cases it will
14400                              * match 'ss', so set that flag */
14401                             maybe_SIMPLE = 0;
14402                             has_ss = TRUE;
14403
14404                             /* It can't change to be an EXACTFU (unless already
14405                              * is one).  We fold it iff under /u rules. */
14406                             if (node_type != EXACTFU) {
14407                                 maybe_exactfu = FALSE;
14408                             }
14409                             else {
14410                                 if (UNLIKELY(len + 2 > max_string_len)) {
14411                                     overflowed = TRUE;
14412                                     break;
14413                                 }
14414
14415                                 *(s++) = 's';
14416                                 *(s++) = 's';
14417                                 added_len = 2;
14418
14419                                 goto done_with_this_char;
14420                             }
14421                         }
14422                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14423                                  && LIKELY(len > 0)
14424                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14425                         {
14426                             /* Also, the sequence 'ss' is special when not
14427                              * under /u.  If the target string is UTF-8, it
14428                              * should match SHARP S; otherwise it won't.  So,
14429                              * here we have to exclude the possibility of this
14430                              * node moving to /u.*/
14431                             has_ss = TRUE;
14432                             maybe_exactfu = FALSE;
14433                         }
14434 #endif
14435                         /* Here, the fold will be a single character */
14436
14437                         if (UNLIKELY(ender == MICRO_SIGN)) {
14438                             has_micro_sign = TRUE;
14439                         }
14440                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14441
14442                             /* If the character's fold differs between /d and
14443                              * /u, this can't change to be an EXACTFU node */
14444                             maybe_exactfu = FALSE;
14445                         }
14446
14447                         *(s++) = (DEPENDS_SEMANTICS)
14448                                  ? (char) toFOLD(ender)
14449
14450                                    /* Under /u, the fold of any character in
14451                                     * the 0-255 range happens to be its
14452                                     * lowercase equivalent, except for LATIN
14453                                     * SMALL LETTER SHARP S, which was handled
14454                                     * above, and the MICRO SIGN, whose fold
14455                                     * requires UTF-8 to represent.  */
14456                                  : (char) toLOWER_L1(ender);
14457                     }
14458                 } /* End of adding current character to the node */
14459
14460               done_with_this_char:
14461
14462                 len += added_len;
14463
14464                 if (next_is_quantifier) {
14465
14466                     /* Here, the next input is a quantifier, and to get here,
14467                      * the current character is the only one in the node. */
14468                     goto loopdone;
14469                 }
14470
14471             } /* End of loop through literal characters */
14472
14473             /* Here we have either exhausted the input or run out of room in
14474              * the node.  If the former, we are done.  (If we encountered a
14475              * character that can't be in the node, transfer is made directly
14476              * to <loopdone>, and so we wouldn't have fallen off the end of the
14477              * loop.)  */
14478             if (LIKELY(! overflowed)) {
14479                 goto loopdone;
14480             }
14481
14482             /* Here we have run out of room.  We can grow plain EXACT and
14483              * LEXACT nodes.  If the pattern is gigantic enough, though,
14484              * eventually we'll have to artificially chunk the pattern into
14485              * multiple nodes. */
14486             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14487                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14488                 Size_t overhead_expansion = 0;
14489                 char temp[256];
14490                 Size_t max_nodes_for_string;
14491                 Size_t achievable;
14492                 SSize_t delta;
14493
14494                 /* Here we couldn't fit the final character in the current
14495                  * node, so it will have to be reparsed, no matter what else we
14496                  * do */
14497                 p = oldp;
14498
14499                 /* If would have overflowed a regular EXACT node, switch
14500                  * instead to an LEXACT.  The code below is structured so that
14501                  * the actual growing code is common to changing from an EXACT
14502                  * or just increasing the LEXACT size.  This means that we have
14503                  * to save the string in the EXACT case before growing, and
14504                  * then copy it afterwards to its new location */
14505                 if (node_type == EXACT) {
14506                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14507                     RExC_emit += overhead_expansion;
14508                     Copy(s0, temp, len, char);
14509                 }
14510
14511                 /* Ready to grow.  If it was a plain EXACT, the string was
14512                  * saved, and the first few bytes of it overwritten by adding
14513                  * an argument field.  We assume, as we do elsewhere in this
14514                  * file, that one byte of remaining input will translate into
14515                  * one byte of output, and if that's too small, we grow again,
14516                  * if too large the excess memory is freed at the end */
14517
14518                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14519                 achievable = MIN(max_nodes_for_string,
14520                                  current_string_nodes + STR_SZ(RExC_end - p));
14521                 delta = achievable - current_string_nodes;
14522
14523                 /* If there is just no more room, go finish up this chunk of
14524                  * the pattern. */
14525                 if (delta <= 0) {
14526                     goto loopdone;
14527                 }
14528
14529                 change_engine_size(pRExC_state, delta + overhead_expansion);
14530                 current_string_nodes += delta;
14531                 max_string_len
14532                            = sizeof(struct regnode) * current_string_nodes;
14533                 upper_fill = max_string_len + 1;
14534
14535                 /* If the length was small, we know this was originally an
14536                  * EXACT node now converted to LEXACT, and the string has to be
14537                  * restored.  Otherwise the string was untouched.  260 is just
14538                  * a number safely above 255 so don't have to worry about
14539                  * getting it precise */
14540                 if (len < 260) {
14541                     node_type = LEXACT;
14542                     FILL_NODE(ret, node_type);
14543                     s0 = STRING(REGNODE_p(ret));
14544                     Copy(temp, s0, len, char);
14545                     s = s0 + len;
14546                 }
14547
14548                 goto continue_parse;
14549             }
14550             else if (FOLD) {
14551                 bool splittable = FALSE;
14552                 bool backed_up = FALSE;
14553                 char * e;
14554                 char * s_start;
14555
14556                 /* Here is /i.  Running out of room creates a problem if we are
14557                  * folding, and the split happens in the middle of a
14558                  * multi-character fold, as a match that should have occurred,
14559                  * won't, due to the way nodes are matched, and our artificial
14560                  * boundary.  So back off until we aren't splitting such a
14561                  * fold.  If there is no such place to back off to, we end up
14562                  * taking the entire node as-is.  This can happen if the node
14563                  * consists entirely of 'f' or entirely of 's' characters (or
14564                  * things that fold to them) as 'ff' and 'ss' are
14565                  * multi-character folds.
14566                  *
14567                  * The Unicode standard says that multi character folds consist
14568                  * of either two or three characters.  That means we would be
14569                  * splitting one if the final character in the node is at the
14570                  * beginning of either type, or is the second of a three
14571                  * character fold.
14572                  *
14573                  * At this point:
14574                  *  ender     is the code point of the character that won't fit
14575                  *            in the node
14576                  *  s         points to just beyond the final byte in the node.
14577                  *            It's where we would place ender if there were
14578                  *            room, and where in fact we do place ender's fold
14579                  *            in the code below, as we've over-allocated space
14580                  *            for s0 (hence s) to allow for this
14581                  *  e         starts at 's' and advances as we append things.
14582                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14583                  *            have been advanced to beyond it).
14584                  *  old_old_s points to the beginning byte of the final
14585                  *            character in the node
14586                  *  p         points to the beginning byte in the input of the
14587                  *            character beyond 'ender'.
14588                  *  oldp      points to the beginning byte in the input of
14589                  *            'ender'.
14590                  *
14591                  * In the case of /il, we haven't folded anything that could be
14592                  * affected by the locale.  That means only above-Latin1
14593                  * characters that fold to other above-latin1 characters get
14594                  * folded at compile time.  To check where a good place to
14595                  * split nodes is, everything in it will have to be folded.
14596                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14597                  * any unfolded characters in the node. */
14598                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14599
14600                 /* If we do need to fold the node, we need a place to store the
14601                  * folded copy, and a way to map back to the unfolded original
14602                  * */
14603                 char * locfold_buf = NULL;
14604                 Size_t * loc_correspondence = NULL;
14605
14606                 if (! need_to_fold_loc) {   /* The normal case.  Just
14607                                                initialize to the actual node */
14608                     e = s;
14609                     s_start = s0;
14610                     s = old_old_s;  /* Point to the beginning of the final char
14611                                        that fits in the node */
14612                 }
14613                 else {
14614
14615                     /* Here, we have filled a /il node, and there are unfolded
14616                      * characters in it.  If the runtime locale turns out to be
14617                      * UTF-8, there are possible multi-character folds, just
14618                      * like when not under /l.  The node hence can't terminate
14619                      * in the middle of such a fold.  To determine this, we
14620                      * have to create a folded copy of this node.  That means
14621                      * reparsing the node, folding everything assuming a UTF-8
14622                      * locale.  (If at runtime it isn't such a locale, the
14623                      * actions here wouldn't have been necessary, but we have
14624                      * to assume the worst case.)  If we find we need to back
14625                      * off the folded string, we do so, and then map that
14626                      * position back to the original unfolded node, which then
14627                      * gets output, truncated at that spot */
14628
14629                     char * redo_p = RExC_parse;
14630                     char * redo_e;
14631                     char * old_redo_e;
14632
14633                     /* Allow enough space assuming a single byte input folds to
14634                      * a single byte output, plus assume that the two unparsed
14635                      * characters (that we may need) fold to the largest number
14636                      * of bytes possible, plus extra for one more worst case
14637                      * scenario.  In the loop below, if we start eating into
14638                      * that final spare space, we enlarge this initial space */
14639                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14640
14641                     Newxz(locfold_buf, size, char);
14642                     Newxz(loc_correspondence, size, Size_t);
14643
14644                     /* Redo this node's parse, folding into 'locfold_buf' */
14645                     redo_p = RExC_parse;
14646                     old_redo_e = redo_e = locfold_buf;
14647                     while (redo_p <= oldp) {
14648
14649                         old_redo_e = redo_e;
14650                         loc_correspondence[redo_e - locfold_buf]
14651                                                         = redo_p - RExC_parse;
14652
14653                         if (UTF) {
14654                             Size_t added_len;
14655
14656                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14657                                                        (U8 *) RExC_end,
14658                                                        (U8 *) redo_e,
14659                                                        &added_len,
14660                                                        FOLD_FLAGS_FULL);
14661                             redo_e += added_len;
14662                             redo_p += UTF8SKIP(redo_p);
14663                         }
14664                         else {
14665
14666                             /* Note that if this code is run on some ancient
14667                              * Unicode versions, SHARP S doesn't fold to 'ss',
14668                              * but rather than clutter the code with #ifdef's,
14669                              * as is done above, we ignore that possibility.
14670                              * This is ok because this code doesn't affect what
14671                              * gets matched, but merely where the node gets
14672                              * split */
14673                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14674                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14675                             }
14676                             else {
14677                                 *redo_e++ = 's';
14678                                 *redo_e++ = 's';
14679                             }
14680                             redo_p++;
14681                         }
14682
14683
14684                         /* If we're getting so close to the end that a
14685                          * worst-case fold in the next character would cause us
14686                          * to overflow, increase, assuming one byte output byte
14687                          * per one byte input one, plus room for another worst
14688                          * case fold */
14689                         if (   redo_p <= oldp
14690                             && redo_e > locfold_buf + size
14691                                                     - (UTF8_MAXBYTES_CASE + 1))
14692                         {
14693                             Size_t new_size = size
14694                                             + (oldp - redo_p)
14695                                             + UTF8_MAXBYTES_CASE + 1;
14696                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14697
14698                             Renew(locfold_buf, new_size, char);
14699                             Renew(loc_correspondence, new_size, Size_t);
14700                             size = new_size;
14701
14702                             redo_e = locfold_buf + e_offset;
14703                         }
14704                     }
14705
14706                     /* Set so that things are in terms of the folded, temporary
14707                      * string */
14708                     s = old_redo_e;
14709                     s_start = locfold_buf;
14710                     e = redo_e;
14711
14712                 }
14713
14714                 /* Here, we have 's', 's_start' and 'e' set up to point to the
14715                  * input that goes into the node, folded.
14716                  *
14717                  * If the final character of the node and the fold of ender
14718                  * form the first two characters of a three character fold, we
14719                  * need to peek ahead at the next (unparsed) character in the
14720                  * input to determine if the three actually do form such a
14721                  * fold.  Just looking at that character is not generally
14722                  * sufficient, as it could be, for example, an escape sequence
14723                  * that evaluates to something else, and it needs to be folded.
14724                  *
14725                  * khw originally thought to just go through the parse loop one
14726                  * extra time, but that doesn't work easily as that iteration
14727                  * could cause things to think that the parse is over and to
14728                  * goto loopdone.  The character could be a '$' for example, or
14729                  * the character beyond could be a quantifier, and other
14730                  * glitches as well.
14731                  *
14732                  * The solution used here for peeking ahead is to look at that
14733                  * next character.  If it isn't ASCII punctuation, then it will
14734                  * be something that continues in an EXACTish node if there
14735                  * were space.  We append the fold of it to s, having reserved
14736                  * enough room in s0 for the purpose.  If we can't reasonably
14737                  * peek ahead, we instead assume the worst case: that it is
14738                  * something that would form the completion of a multi-char
14739                  * fold.
14740                  *
14741                  * If we can't split between s and ender, we work backwards
14742                  * character-by-character down to s0.  At each current point
14743                  * see if we are at the beginning of a multi-char fold.  If so,
14744                  * that means we would be splitting the fold across nodes, and
14745                  * so we back up one and try again.
14746                  *
14747                  * If we're not at the beginning, we still could be at the
14748                  * final two characters of a (rare) three character fold.  We
14749                  * check if the sequence starting at the character before the
14750                  * current position (and including the current and next
14751                  * characters) is a three character fold.  If not, the node can
14752                  * be split here.  If it is, we have to backup two characters
14753                  * and try again.
14754                  *
14755                  * Otherwise, the node can be split at the current position.
14756                  *
14757                  * The same logic is used for UTF-8 patterns and not */
14758                 if (UTF) {
14759                     Size_t added_len;
14760
14761                     /* Append the fold of ender */
14762                     (void) _to_uni_fold_flags(
14763                         ender,
14764                         (U8 *) e,
14765                         &added_len,
14766                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14767                                         ? FOLD_FLAGS_NOMIX_ASCII
14768                                         : 0));
14769                     e += added_len;
14770
14771                     /* 's' and the character folded to by ender may be the
14772                      * first two of a three-character fold, in which case the
14773                      * node should not be split here.  That may mean examining
14774                      * the so-far unparsed character starting at 'p'.  But if
14775                      * ender folded to more than one character, we already have
14776                      * three characters to look at.  Also, we first check if
14777                      * the sequence consisting of s and the next character form
14778                      * the first two of some three character fold.  If not,
14779                      * there's no need to peek ahead. */
14780                     if (   added_len <= UTF8SKIP(e - added_len)
14781                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
14782                     {
14783                         /* Here, the two do form the beginning of a potential
14784                          * three character fold.  The unexamined character may
14785                          * or may not complete it.  Peek at it.  It might be
14786                          * something that ends the node or an escape sequence,
14787                          * in which case we don't know without a lot of work
14788                          * what it evaluates to, so we have to assume the worst
14789                          * case: that it does complete the fold, and so we
14790                          * can't split here.  All such instances  will have
14791                          * that character be an ASCII punctuation character,
14792                          * like a backslash.  So, for that case, backup one and
14793                          * drop down to try at that position */
14794                         if (isPUNCT(*p)) {
14795                             s = (char *) utf8_hop_back((U8 *) s, -1,
14796                                        (U8 *) s_start);
14797                             backed_up = TRUE;
14798                         }
14799                         else {
14800                             /* Here, since it's not punctuation, it must be a
14801                              * real character, and we can append its fold to
14802                              * 'e' (having deliberately reserved enough space
14803                              * for this eventuality) and drop down to check if
14804                              * the three actually do form a folded sequence */
14805                             (void) _to_utf8_fold_flags(
14806                                 (U8 *) p, (U8 *) RExC_end,
14807                                 (U8 *) e,
14808                                 &added_len,
14809                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14810                                                 ? FOLD_FLAGS_NOMIX_ASCII
14811                                                 : 0));
14812                             e += added_len;
14813                         }
14814                     }
14815
14816                     /* Here, we either have three characters available in
14817                      * sequence starting at 's', or we have two characters and
14818                      * know that the following one can't possibly be part of a
14819                      * three character fold.  We go through the node backwards
14820                      * until we find a place where we can split it without
14821                      * breaking apart a multi-character fold.  At any given
14822                      * point we have to worry about if such a fold begins at
14823                      * the current 's', and also if a three-character fold
14824                      * begins at s-1, (containing s and s+1).  Splitting in
14825                      * either case would break apart a fold */
14826                     do {
14827                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
14828                                                             (U8 *) s_start);
14829
14830                         /* If is a multi-char fold, can't split here.  Backup
14831                          * one char and try again */
14832                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
14833                             s = prev_s;
14834                             backed_up = TRUE;
14835                             continue;
14836                         }
14837
14838                         /* If the two characters beginning at 's' are part of a
14839                          * three character fold starting at the character
14840                          * before s, we can't split either before or after s.
14841                          * Backup two chars and try again */
14842                         if (   LIKELY(s > s_start)
14843                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
14844                         {
14845                             s = prev_s;
14846                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
14847                             backed_up = TRUE;
14848                             continue;
14849                         }
14850
14851                         /* Here there's no multi-char fold between s and the
14852                          * next character following it.  We can split */
14853                         splittable = TRUE;
14854                         break;
14855
14856                     } while (s > s_start); /* End of loops backing up through the node */
14857
14858                     /* Here we either couldn't find a place to split the node,
14859                      * or else we broke out of the loop setting 'splittable' to
14860                      * true.  In the latter case, the place to split is between
14861                      * the first and second characters in the sequence starting
14862                      * at 's' */
14863                     if (splittable) {
14864                         s += UTF8SKIP(s);
14865                     }
14866                 }
14867                 else {  /* Pattern not UTF-8 */
14868                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
14869                         || ASCII_FOLD_RESTRICTED)
14870                     {
14871                         *e++ = toLOWER_L1(ender);
14872                     }
14873                     else {
14874                         *e++ = 's';
14875                         *e++ = 's';
14876                     }
14877
14878                     if (   e - s  <= 1
14879                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
14880                     {
14881                         if (isPUNCT(*p)) {
14882                             s--;
14883                             backed_up = TRUE;
14884                         }
14885                         else {
14886                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
14887                                 || ASCII_FOLD_RESTRICTED)
14888                             {
14889                                 *e++ = toLOWER_L1(ender);
14890                             }
14891                             else {
14892                                 *e++ = 's';
14893                                 *e++ = 's';
14894                             }
14895                         }
14896                     }
14897
14898                     do {
14899                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
14900                             s--;
14901                             backed_up = TRUE;
14902                             continue;
14903                         }
14904
14905                         if (   LIKELY(s > s_start)
14906                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
14907                         {
14908                             s -= 2;
14909                             backed_up = TRUE;
14910                             continue;
14911                         }
14912
14913                         splittable = TRUE;
14914                         break;
14915
14916                     } while (s > s_start);
14917
14918                     if (splittable) {
14919                         s++;
14920                     }
14921                 }
14922
14923                 /* Here, we are done backing up.  If we didn't backup at all
14924                  * (the likely case), just proceed */
14925                 if (backed_up) {
14926
14927                    /* If we did find a place to split, reparse the entire node
14928                     * stopping where we have calculated. */
14929                     if (splittable) {
14930
14931                        /* If we created a temporary folded string under /l, we
14932                         * have to map that back to the original */
14933                         if (need_to_fold_loc) {
14934                             upper_fill = loc_correspondence[s - s_start];
14935                             Safefree(locfold_buf);
14936                             Safefree(loc_correspondence);
14937
14938                             if (upper_fill == 0) {
14939                                 FAIL2("panic: loc_correspondence[%d] is 0",
14940                                       (int) (s - s_start));
14941                             }
14942                         }
14943                         else {
14944                             upper_fill = s - s0;
14945                         }
14946                         goto reparse;
14947                     }
14948                     else if (need_to_fold_loc) {
14949                         Safefree(locfold_buf);
14950                         Safefree(loc_correspondence);
14951                     }
14952
14953                     /* Here the node consists entirely of non-final multi-char
14954                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
14955                      * decent place to split it, so give up and just take the
14956                      * whole thing */
14957                     len = old_s - s0;
14958                 }
14959             }   /* End of verifying node ends with an appropriate char */
14960
14961             /* We need to start the next node at the character that didn't fit
14962              * in this one */
14963             p = oldp;
14964
14965           loopdone:   /* Jumped to when encounters something that shouldn't be
14966                          in the node */
14967
14968             /* Free up any over-allocated space; cast is to silence bogus
14969              * warning in MS VC */
14970             change_engine_size(pRExC_state,
14971                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
14972
14973             /* I (khw) don't know if you can get here with zero length, but the
14974              * old code handled this situation by creating a zero-length EXACT
14975              * node.  Might as well be NOTHING instead */
14976             if (len == 0) {
14977                 OP(REGNODE_p(ret)) = NOTHING;
14978             }
14979             else {
14980
14981                 /* If the node type is EXACT here, check to see if it
14982                  * should be EXACTL, or EXACT_REQ8. */
14983                 if (node_type == EXACT) {
14984                     if (LOC) {
14985                         node_type = EXACTL;
14986                     }
14987                     else if (requires_utf8_target) {
14988                         node_type = EXACT_REQ8;
14989                     }
14990                 }
14991                 else if (node_type == LEXACT) {
14992                     if (requires_utf8_target) {
14993                         node_type = LEXACT_REQ8;
14994                     }
14995                 }
14996                 else if (FOLD) {
14997                     if (    UNLIKELY(has_micro_sign || has_ss)
14998                         && (node_type == EXACTFU || (   node_type == EXACTF
14999                                                      && maybe_exactfu)))
15000                     {   /* These two conditions are problematic in non-UTF-8
15001                            EXACTFU nodes. */
15002                         assert(! UTF);
15003                         node_type = EXACTFUP;
15004                     }
15005                     else if (node_type == EXACTFL) {
15006
15007                         /* 'maybe_exactfu' is deliberately set above to
15008                          * indicate this node type, where all code points in it
15009                          * are above 255 */
15010                         if (maybe_exactfu) {
15011                             node_type = EXACTFLU8;
15012                         }
15013                         else if (UNLIKELY(
15014                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15015                         {
15016                             /* A character that folds to more than one will
15017                              * match multiple characters, so can't be SIMPLE.
15018                              * We don't have to worry about this with EXACTFLU8
15019                              * nodes just above, as they have already been
15020                              * folded (since the fold doesn't vary at run
15021                              * time).  Here, if the final character in the node
15022                              * folds to multiple, it can't be simple.  (This
15023                              * only has an effect if the node has only a single
15024                              * character, hence the final one, as elsewhere we
15025                              * turn off simple for nodes whose length > 1 */
15026                             maybe_SIMPLE = 0;
15027                         }
15028                     }
15029                     else if (node_type == EXACTF) {  /* Means is /di */
15030
15031                         /* This intermediate variable is needed solely because
15032                          * the asserts in the macro where used exceed Win32's
15033                          * literal string capacity */
15034                         char first_char = * STRING(REGNODE_p(ret));
15035
15036                         /* If 'maybe_exactfu' is clear, then we need to stay
15037                          * /di.  If it is set, it means there are no code
15038                          * points that match differently depending on UTF8ness
15039                          * of the target string, so it can become an EXACTFU
15040                          * node */
15041                         if (! maybe_exactfu) {
15042                             RExC_seen_d_op = TRUE;
15043                         }
15044                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15045                                  || isALPHA_FOLD_EQ(ender, 's'))
15046                         {
15047                             /* But, if the node begins or ends in an 's' we
15048                              * have to defer changing it into an EXACTFU, as
15049                              * the node could later get joined with another one
15050                              * that ends or begins with 's' creating an 'ss'
15051                              * sequence which would then wrongly match the
15052                              * sharp s without the target being UTF-8.  We
15053                              * create a special node that we resolve later when
15054                              * we join nodes together */
15055
15056                             node_type = EXACTFU_S_EDGE;
15057                         }
15058                         else {
15059                             node_type = EXACTFU;
15060                         }
15061                     }
15062
15063                     if (requires_utf8_target && node_type == EXACTFU) {
15064                         node_type = EXACTFU_REQ8;
15065                     }
15066                 }
15067
15068                 OP(REGNODE_p(ret)) = node_type;
15069                 setSTR_LEN(REGNODE_p(ret), len);
15070                 RExC_emit += STR_SZ(len);
15071
15072                 /* If the node isn't a single character, it can't be SIMPLE */
15073                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15074                     maybe_SIMPLE = 0;
15075                 }
15076
15077                 *flagp |= HASWIDTH | maybe_SIMPLE;
15078             }
15079
15080             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15081             RExC_parse = p;
15082
15083             {
15084                 /* len is STRLEN which is unsigned, need to copy to signed */
15085                 IV iv = len;
15086                 if (iv < 0)
15087                     vFAIL("Internal disaster");
15088             }
15089
15090         } /* End of label 'defchar:' */
15091         break;
15092     } /* End of giant switch on input character */
15093
15094     /* Position parse to next real character */
15095     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15096                                             FALSE /* Don't force to /x */ );
15097     if (   *RExC_parse == '{'
15098         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15099     {
15100         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15101             RExC_parse++;
15102             vFAIL("Unescaped left brace in regex is illegal here");
15103         }
15104         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15105                                   " passed through");
15106     }
15107
15108     return(ret);
15109 }
15110
15111
15112 STATIC void
15113 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15114 {
15115     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15116      * sets up the bitmap and any flags, removing those code points from the
15117      * list; if that empties the list, it is released and set to NULL */
15118
15119     dVAR;
15120
15121     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15122     assert(PL_regkind[OP(node)] == ANYOF);
15123
15124     /* There is no bitmap for these node types, so nothing to populate */
15125     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15126         return;
15127     }
15128     /* Start from an all-zero bitmap; it stays that way if there is no list */
15129     ANYOF_BITMAP_ZERO(node);
15130     if (*invlist_ptr) {
15131
15132         /* Gets set if any range puts code points into the bitmap */
15133         bool change_invlist = FALSE;
15134
15135         UV start, end;  /* Inclusive bounds of the range being examined */
15136
15137         /* Walk through the ranges in *invlist_ptr */
15138         invlist_iterinit(*invlist_ptr);
15139         while (invlist_iternext(*invlist_ptr, &start, &end)) {
15140             UV high;    /* Last code point of this range that gets a bit */
15141             int i;
15142             /* This range covers every code point above the bitmap */
15143             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15144                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15145             }
15146
15147             /* Quit if this range starts above what the bitmap handles */
15148             if (start >= NUM_ANYOF_CODE_POINTS) {
15149                 break;
15150             }
15151             /* At least part of this range goes in the bitmap */
15152             change_invlist = TRUE;
15153
15154             /* Set all the bits in the range, clamped to the last bitmap slot */
15155             high = (end < NUM_ANYOF_CODE_POINTS - 1)
15156                    ? end
15157                    : NUM_ANYOF_CODE_POINTS - 1;
15158             for (i = start; i <= (int) high; i++) {
15159                 if (! ANYOF_BITMAP_TEST(node, i)) {
15160                     ANYOF_BITMAP_SET(node, i);
15161                 }
15162             }
15163         }
15164         invlist_iterfinish(*invlist_ptr);
15165
15166         /* Done with loop; remove any code points that are in the bitmap from
15167          * *invlist_ptr; similarly for code points above the bitmap if we have
15168          * a flag to match all of them anyway */
15169         if (change_invlist) {
15170             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15171         }
15172         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15173             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15174         }
15175
15176         /* If that completely emptied the list, release it and NULL the pointer */
15177         if (_invlist_len(*invlist_ptr) == 0) {
15178             SvREFCNT_dec_NN(*invlist_ptr);
15179             *invlist_ptr = NULL;
15180         }
15181     }
15182 }
15183
15184 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15185    Character classes ([:foo:]) can also be negated ([:^foo:]).
15186    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15187    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15188    but trigger failures because they are currently unimplemented. */
15189
15190 #define POSIXCC_DONE(c)   ((c) == ':')  /* Delimiter of a [: :] class */
15191 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')  /* Of [= =] and [. .] */
15192 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15193 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15194 /* Fragments of warning text; ADD_POSIX_WARNING() puts WARNING_PREFIX first */
15195 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15196 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15197 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15198 /* Value just below OOB_NAMEDCLASS, for when no POSIX construct seems meant */
15199 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15200 /* ADD_POSIX_WARNING(p, text) queues WARNING_PREFIX + 'text' + location of 'p' */
15201 /* 'posix_warnings' is a variable in the following routine, q.v.; the text is
15202  * accumulated in RExC_warn_text.  'text' is pasted between string literals */
15203 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15204         if (posix_warnings) {                                               \
15205             if (! RExC_warn_text ) RExC_warn_text =                         \
15206                                          (AV *) sv_2mortal((SV *) newAV()); \
15207             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15208                                              WARNING_PREFIX                 \
15209                                              text                           \
15210                                              REPORT_LOCATION,               \
15211                                              REPORT_LOCATION_ARGS(p)));     \
15212         }                                                                   \
15213     } STMT_END
15214 #define CLEAR_POSIX_WARNINGS()                                              \
15215     STMT_START {                                                            \
15216         if (posix_warnings && RExC_warn_text)                               \
15217             av_clear(RExC_warn_text);                                       \
15218     } STMT_END
15219 /* CLEAR_POSIX_WARNINGS() discards queued warnings; this one also returns */
15220 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15221     STMT_START {                                                            \
15222         CLEAR_POSIX_WARNINGS();                                             \
15223         return ret;                                                         \
15224     } STMT_END
15225
15226 STATIC int
15227 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15228
15229     const char * const s,      /* Where the putative posix class begins.
15230                                   Normally, this is one past the '['.  This
15231                                   parameter exists so it can be somewhere
15232                                   besides RExC_parse. */
15233     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15234                                   NULL */
15235     AV ** posix_warnings,      /* Where to place any generated warnings, or
15236                                   NULL */
15237     const bool check_only      /* Don't die if error */
15238 )
15239 {
15240     /* This parses what the caller thinks may be one of the three POSIX
15241      * constructs:
15242      *  1) a character class, like [:blank:]
15243      *  2) a collating symbol, like [. .]
15244      *  3) an equivalence class, like [= =]
15245      * In the latter two cases, it croaks if it finds a syntactically legal
15246      * one, as these are not handled by Perl.
15247      *
15248      * The main purpose is to look for a POSIX character class.  It returns:
15249      *  a) the class number
15250      *      if it is a completely syntactically and semantically legal class.
15251      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15252      *      closing ']' of the class
15253      *  b) OOB_NAMEDCLASS
15254      *      if it appears that one of the three POSIX constructs was meant, but
15255      *      its specification was somehow defective.  'updated_parse_ptr', if
15256      *      not NULL, is set to point to the character just after the end
15257      *      character of the class.  See below for handling of warnings.
15258      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15259      *      if it  doesn't appear that a POSIX construct was intended.
15260      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15261      *      raised.
15262      *
15263      * In b) there may be errors or warnings generated.  If 'check_only' is
15264      * TRUE, then any errors are discarded.  Warnings are returned to the
15265      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15266      * instead it is NULL, warnings are suppressed.
15267      *
15268      * The reason for this function, and its complexity is that a bracketed
15269      * character class can contain just about anything.  But it's easy to
15270      * mistype the very specific posix class syntax but yielding a valid
15271      * regular bracketed class, so it silently gets compiled into something
15272      * quite unintended.
15273      *
15274      * The solution adopted here maintains backward compatibility except that
15275      * it adds a warning if it looks like a posix class was intended but
15276      * improperly specified.  The warning is not raised unless what is input
15277      * very closely resembles one of the 14 legal posix classes.  To do this,
15278      * it uses fuzzy parsing.  It calculates how many single-character edits it
15279      * would take to transform what was input into a legal posix class.  Only
15280      * if that number is quite small does it think that the intention was a
15281      * posix class.  Obviously these are heuristics, and there will be cases
15282      * where it errs on one side or another, and they can be tweaked as
15283      * experience informs.
15284      *
15285      * The syntax for a legal posix class is:
15286      *
15287      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15288      *
15289      * What this routine considers syntactically to be an intended posix class
15290      * is this (the comments indicate some restrictions that the pattern
15291      * doesn't show):
15292      *
15293      *  qr/(?x: \[?                         # The left bracket, possibly
15294      *                                      # omitted
15295      *          \h*                         # possibly followed by blanks
15296      *          (?: \^ \h* )?               # possibly a misplaced caret
15297      *          [:;]?                       # The opening class character,
15298      *                                      # possibly omitted.  A typo
15299      *                                      # semi-colon can also be used.
15300      *          \h*
15301      *          \^?                         # possibly a correctly placed
15302      *                                      # caret, but not if there was also
15303      *                                      # a misplaced one
15304      *          \h*
15305      *          .{3,15}                     # The class name.  If there are
15306      *                                      # deviations from the legal syntax,
15307      *                                      # its edit distance must be close
15308      *                                      # to a real class name in order
15309      *                                      # for it to be considered to be
15310      *                                      # an intended posix class.
15311      *          \h*
15312      *          [[:punct:]]?                # The closing class character,
15313      *                                      # possibly omitted.  If not a colon
15314      *                                      # nor semi colon, the class name
15315      *                                      # must be even closer to a valid
15316      *                                      # one
15317      *          \h*
15318      *          \]?                         # The right bracket, possibly
15319      *                                      # omitted.
15320      *     )/
15321      *
15322      * In the above, \h must be ASCII-only.
15323      *
15324      * These are heuristics, and can be tweaked as field experience dictates.
15325      * There will be cases when someone didn't intend to specify a posix class
15326      * that this warns as being so.  The goal is to minimize these, while
15327      * maximizing the catching of things intended to be a posix class that
15328      * aren't parsed as such.
15329      */
15330
15331     const char* p             = s;
15332     const char * const e      = RExC_end;
15333     unsigned complement       = 0;      /* If to complement the class */
15334     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15335     bool has_opening_bracket  = FALSE;
15336     bool has_opening_colon    = FALSE;
15337     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15338                                                    valid class */
15339     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15340     const char* name_start;             /* ptr to class name first char */
15341
15342     /* If the number of single-character typos the input name is away from a
15343      * legal name is no more than this number, it is considered to have meant
15344      * the legal name */
15345     int max_distance          = 2;
15346
15347     /* to store the name.  The size determines the maximum length before we
15348      * decide that no posix class was intended.  Should be at least
15349      * sizeof("alphanumeric") */
15350     UV input_text[15];
15351     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15352
15353     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15354
15355     CLEAR_POSIX_WARNINGS();
15356
15357     if (p >= e) {
15358         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15359     }
15360
15361     if (*(p - 1) != '[') {
15362         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15363         found_problem = TRUE;
15364     }
15365     else {
15366         has_opening_bracket = TRUE;
15367     }
15368
15369     /* They could be confused and think you can put spaces between the
15370      * components */
15371     if (isBLANK(*p)) {
15372         found_problem = TRUE;
15373
15374         do {
15375             p++;
15376         } while (p < e && isBLANK(*p));
15377
15378         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15379     }
15380
15381     /* For [. .] and [= =].  These are quite different internally from [: :],
15382      * so they are handled separately.  */
15383     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15384                                             and 1 for at least one char in it
15385                                           */
15386     {
15387         const char open_char  = *p;
15388         const char * temp_ptr = p + 1;
15389
15390         /* These two constructs are not handled by perl, and if we find a
15391          * syntactically valid one, we croak.  khw, who wrote this code, finds
15392          * this explanation of them very unclear:
15393          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15394          * And searching the rest of the internet wasn't very helpful either.
15395          * It looks like just about any byte can be in these constructs,
15396          * depending on the locale.  But unless the pattern is being compiled
15397          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15398          * In that case, it looks like [= =] isn't allowed at all, and that
15399          * [. .] could be any single code point, but for longer strings the
15400          * constituent characters would have to be the ASCII alphabetics plus
15401          * the minus-hyphen.  Any sensible locale definition would limit itself
15402          * to these.  And any portable one definitely should.  Trying to parse
15403          * the general case is a nightmare (see [perl #127604]).  So, this code
15404          * looks only for interiors of these constructs that match:
15405          *      qr/.|[-\w]{2,}/
15406          * Using \w relaxes the apparent rules a little, without adding much
15407          * danger of mistaking something else for one of these constructs.
15408          *
15409          * [. .] in some implementations described on the internet is usable to
15410          * escape a character that otherwise is special in bracketed character
15411          * classes.  For example [.].] means a literal right bracket instead of
15412          * the ending of the class
15413          *
15414          * [= =] can legitimately contain a [. .] construct, but we don't
15415          * handle this case, as that [. .] construct will later get parsed
15416          * itself and croak then.  And [= =] is checked for even when not under
15417          * /l, as Perl has long done so.
15418          *
15419          * The code below relies on there being a trailing NUL, so it doesn't
15420          * have to keep checking if the parse ptr < e.
15421          */
15422         if (temp_ptr[1] == open_char) {
15423             temp_ptr++;
15424         }
15425         else while (    temp_ptr < e
15426                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15427         {
15428             temp_ptr++;
15429         }
15430
15431         if (*temp_ptr == open_char) {
15432             temp_ptr++;
15433             if (*temp_ptr == ']') {
15434                 temp_ptr++;
15435                 if (! found_problem && ! check_only) {
15436                     RExC_parse = (char *) temp_ptr;
15437                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15438                             "extensions", open_char, open_char);
15439                 }
15440
15441                 /* Here, the syntax wasn't completely valid, or else the call
15442                  * is to check-only */
15443                 if (updated_parse_ptr) {
15444                     *updated_parse_ptr = (char *) temp_ptr;
15445                 }
15446
15447                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15448             }
15449         }
15450
15451         /* If we find something that started out to look like one of these
15452          * constructs, but isn't, we continue below so that it can be checked
15453          * for being a class name with a typo of '.' or '=' instead of a colon.
15454          * */
15455     }
15456
15457     /* Here, we think there is a possibility that a [: :] class was meant, and
15458      * we have the first real character.  It could be they think the '^' comes
15459      * first */
15460     if (*p == '^') {
15461         found_problem = TRUE;
15462         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15463         complement = 1;
15464         p++;
15465
15466         if (isBLANK(*p)) {
15467             found_problem = TRUE;
15468
15469             do {
15470                 p++;
15471             } while (p < e && isBLANK(*p));
15472
15473             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15474         }
15475     }
15476
15477     /* But the first character should be a colon, which they could have easily
15478      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15479      * distinguish from a colon, so treat that as a colon).  */
15480     if (*p == ':') {
15481         p++;
15482         has_opening_colon = TRUE;
15483     }
15484     else if (*p == ';') {
15485         found_problem = TRUE;
15486         p++;
15487         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15488         has_opening_colon = TRUE;
15489     }
15490     else {
15491         found_problem = TRUE;
15492         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15493
15494         /* Consider an initial punctuation (not one of the recognized ones) to
15495          * be a left terminator */
15496         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15497             p++;
15498         }
15499     }
15500
15501     /* They may think that you can put spaces between the components */
15502     if (isBLANK(*p)) {
15503         found_problem = TRUE;
15504
15505         do {
15506             p++;
15507         } while (p < e && isBLANK(*p));
15508
15509         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15510     }
15511
15512     if (*p == '^') {
15513
15514         /* We consider something like [^:^alnum:]] to not have been intended to
15515          * be a posix class, but XXX maybe we should */
15516         if (complement) {
15517             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15518         }
15519
15520         complement = 1;
15521         p++;
15522     }
15523
15524     /* Again, they may think that you can put spaces between the components */
15525     if (isBLANK(*p)) {
15526         found_problem = TRUE;
15527
15528         do {
15529             p++;
15530         } while (p < e && isBLANK(*p));
15531
15532         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15533     }
15534
15535     if (*p == ']') {
15536
15537         /* XXX This ']' may be a typo, and something else was meant.  But
15538          * treating it as such creates enough complications, that that
15539          * possibility isn't currently considered here.  So we assume that the
15540          * ']' is what is intended, and if we've already found an initial '[',
15541          * this leaves this construct looking like [:] or [:^], which almost
15542          * certainly weren't intended to be posix classes */
15543         if (has_opening_bracket) {
15544             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15545         }
15546
15547         /* But this function can be called when we parse the colon for
15548          * something like qr/[alpha:]]/, so we back up to look for the
15549          * beginning */
15550         p--;
15551
15552         if (*p == ';') {
15553             found_problem = TRUE;
15554             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15555         }
15556         else if (*p != ':') {
15557
15558             /* XXX We are currently very restrictive here, so this code doesn't
15559              * consider the possibility that, say, /[alpha.]]/ was intended to
15560              * be a posix class. */
15561             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15562         }
15563
15564         /* Here we have something like 'foo:]'.  There was no initial colon,
15565          * and we back up over 'foo.  XXX Unlike the going forward case, we
15566          * don't handle typos of non-word chars in the middle */
15567         has_opening_colon = FALSE;
15568         p--;
15569
15570         while (p > RExC_start && isWORDCHAR(*p)) {
15571             p--;
15572         }
15573         p++;
15574
15575         /* Here, we have positioned ourselves to where we think the first
15576          * character in the potential class is */
15577     }
15578
15579     /* Now the interior really starts.  There are certain key characters that
15580      * can end the interior, or these could just be typos.  To catch both
15581      * cases, we may have to do two passes.  In the first pass, we keep on
15582      * going unless we come to a sequence that matches
15583      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15584      * This means it takes a sequence to end the pass, so two typos in a row if
15585      * that wasn't what was intended.  If the class is perfectly formed, just
15586      * this one pass is needed.  We also stop if there are too many characters
15587      * being accumulated, but this number is deliberately set higher than any
15588      * real class.  It is set high enough so that someone who thinks that
15589      * 'alphanumeric' is a correct name would get warned that it wasn't.
15590      * While doing the pass, we keep track of where the key characters were in
15591      * it.  If we don't find an end to the class, and one of the key characters
15592      * was found, we redo the pass, but stop when we get to that character.
15593      * Thus the key character was considered a typo in the first pass, but a
15594      * terminator in the second.  If two key characters are found, we stop at
15595      * the second one in the first pass.  Again this can miss two typos, but
15596      * catches a single one
15597      *
15598      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15599      * point to the first key character.  For the second pass, it starts as -1.
15600      * */
15601
15602     name_start = p;
15603   parse_name:
15604     {
15605         bool has_blank               = FALSE;
15606         bool has_upper               = FALSE;
15607         bool has_terminating_colon   = FALSE;
15608         bool has_terminating_bracket = FALSE;
15609         bool has_semi_colon          = FALSE;
15610         unsigned int name_len        = 0;
15611         int punct_count              = 0;
15612
15613         while (p < e) {
15614
15615             /* Squeeze out blanks when looking up the class name below */
15616             if (isBLANK(*p) ) {
15617                 has_blank = TRUE;
15618                 found_problem = TRUE;
15619                 p++;
15620                 continue;
15621             }
15622
15623             /* The name will end with a punctuation */
15624             if (isPUNCT(*p)) {
15625                 const char * peek = p + 1;
15626
15627                 /* Treat any non-']' punctuation followed by a ']' (possibly
15628                  * with intervening blanks) as trying to terminate the class.
15629                  * ']]' is very likely to mean a class was intended (but
15630                  * missing the colon), but the warning message that gets
15631                  * generated shows the error position better if we exit the
15632                  * loop at the bottom (eventually), so skip it here. */
15633                 if (*p != ']') {
15634                     if (peek < e && isBLANK(*peek)) {
15635                         has_blank = TRUE;
15636                         found_problem = TRUE;
15637                         do {
15638                             peek++;
15639                         } while (peek < e && isBLANK(*peek));
15640                     }
15641
15642                     if (peek < e && *peek == ']') {
15643                         has_terminating_bracket = TRUE;
15644                         if (*p == ':') {
15645                             has_terminating_colon = TRUE;
15646                         }
15647                         else if (*p == ';') {
15648                             has_semi_colon = TRUE;
15649                             has_terminating_colon = TRUE;
15650                         }
15651                         else {
15652                             found_problem = TRUE;
15653                         }
15654                         p = peek + 1;
15655                         goto try_posix;
15656                     }
15657                 }
15658
15659                 /* Here we have punctuation we thought didn't end the class.
15660                  * Keep track of the position of the key characters that are
15661                  * more likely to have been class-enders */
15662                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15663
15664                     /* Allow just one such possible class-ender not actually
15665                      * ending the class. */
15666                     if (possible_end) {
15667                         break;
15668                     }
15669                     possible_end = p;
15670                 }
15671
15672                 /* If we have too many punctuation characters, no use in
15673                  * keeping going */
15674                 if (++punct_count > max_distance) {
15675                     break;
15676                 }
15677
15678                 /* Treat the punctuation as a typo. */
15679                 input_text[name_len++] = *p;
15680                 p++;
15681             }
15682             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15683                 input_text[name_len++] = toLOWER(*p);
15684                 has_upper = TRUE;
15685                 found_problem = TRUE;
15686                 p++;
15687             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15688                 input_text[name_len++] = *p;
15689                 p++;
15690             }
15691             else {
15692                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15693                 p+= UTF8SKIP(p);
15694             }
15695
15696             /* The declaration of 'input_text' is how long we allow a potential
15697              * class name to be, before saying they didn't mean a class name at
15698              * all */
15699             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15700                 break;
15701             }
15702         }
15703
15704         /* We get to here when the possible class name hasn't been properly
15705          * terminated before:
15706          *   1) we ran off the end of the pattern; or
15707          *   2) found two characters, each of which might have been intended to
15708          *      be the name's terminator
15709          *   3) found so many punctuation characters in the purported name,
15710          *      that the edit distance to a valid one is exceeded
15711          *   4) we decided it was more characters than anyone could have
15712          *      intended to be one. */
15713
15714         found_problem = TRUE;
15715
15716         /* In the final two cases, we know that looking up what we've
15717          * accumulated won't lead to a match, even a fuzzy one. */
15718         if (   name_len >= C_ARRAY_LENGTH(input_text)
15719             || punct_count > max_distance)
15720         {
15721             /* If there was an intermediate key character that could have been
15722              * an intended end, redo the parse, but stop there */
15723             if (possible_end && possible_end != (char *) -1) {
15724                 possible_end = (char *) -1; /* Special signal value to say
15725                                                we've done a first pass */
15726                 p = name_start;
15727                 goto parse_name;
15728             }
15729
15730             /* Otherwise, it can't have meant to have been a class */
15731             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15732         }
15733
15734         /* If we ran off the end, and the final character was a punctuation
15735          * one, back up one, to look at that final one just below.  Later, we
15736          * will restore the parse pointer if appropriate */
15737         if (name_len && p == e && isPUNCT(*(p-1))) {
15738             p--;
15739             name_len--;
15740         }
15741
15742         if (p < e && isPUNCT(*p)) {
15743             if (*p == ']') {
15744                 has_terminating_bracket = TRUE;
15745
15746                 /* If this is a 2nd ']', and the first one is just below this
15747                  * one, consider that to be the real terminator.  This gives a
15748                  * uniform and better positioning for the warning message  */
15749                 if (   possible_end
15750                     && possible_end != (char *) -1
15751                     && *possible_end == ']'
15752                     && name_len && input_text[name_len - 1] == ']')
15753                 {
15754                     name_len--;
15755                     p = possible_end;
15756
15757                     /* And this is actually equivalent to having done the 2nd
15758                      * pass now, so set it to not try again */
15759                     possible_end = (char *) -1;
15760                 }
15761             }
15762             else {
15763                 if (*p == ':') {
15764                     has_terminating_colon = TRUE;
15765                 }
15766                 else if (*p == ';') {
15767                     has_semi_colon = TRUE;
15768                     has_terminating_colon = TRUE;
15769                 }
15770                 p++;
15771             }
15772         }
15773
15774     try_posix:
15775
15776         /* Here, we have a class name to look up.  We can short circuit the
15777          * stuff below for short names that can't possibly be meant to be a
15778          * class name.  (We can do this on the first pass, as any second pass
15779          * will yield an even shorter name) */
15780         if (name_len < 3) {
15781             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15782         }
15783
15784         /* Find which class it is.  Initially switch on the length of the name.
15785          * */
15786         switch (name_len) {
15787             case 4:
15788                 if (memEQs(name_start, 4, "word")) {
15789                     /* this is not POSIX, this is the Perl \w */
15790                     class_number = ANYOF_WORDCHAR;
15791                 }
15792                 break;
15793             case 5:
15794                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15795                  *                        graph lower print punct space upper
15796                  * Offset 4 gives the best switch position.  */
15797                 switch (name_start[4]) {
15798                     case 'a':
15799                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15800                             class_number = ANYOF_ALPHA;
15801                         break;
15802                     case 'e':
15803                         if (memBEGINs(name_start, 5, "spac")) /* space */
15804                             class_number = ANYOF_SPACE;
15805                         break;
15806                     case 'h':
15807                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15808                             class_number = ANYOF_GRAPH;
15809                         break;
15810                     case 'i':
15811                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15812                             class_number = ANYOF_ASCII;
15813                         break;
15814                     case 'k':
15815                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15816                             class_number = ANYOF_BLANK;
15817                         break;
15818                     case 'l':
15819                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15820                             class_number = ANYOF_CNTRL;
15821                         break;
15822                     case 'm':
15823                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15824                             class_number = ANYOF_ALPHANUMERIC;
15825                         break;
15826                     case 'r':
15827                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15828                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15829                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15830                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15831                         break;
15832                     case 't':
15833                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15834                             class_number = ANYOF_DIGIT;
15835                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15836                             class_number = ANYOF_PRINT;
15837                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15838                             class_number = ANYOF_PUNCT;
15839                         break;
15840                 }
15841                 break;
15842             case 6:
15843                 if (memEQs(name_start, 6, "xdigit"))
15844                     class_number = ANYOF_XDIGIT;
15845                 break;
15846         }
15847
15848         /* If the name exactly matches a posix class name the class number will
15849          * here be set to it, and the input almost certainly was meant to be a
15850          * posix class, so we can skip further checking.  If instead the syntax
15851          * is exactly correct, but the name isn't one of the legal ones, we
15852          * will return that as an error below.  But if neither of these apply,
15853          * it could be that no posix class was intended at all, or that one
15854          * was, but there was a typo.  We tease these apart by doing fuzzy
15855          * matching on the name */
15856         if (class_number == OOB_NAMEDCLASS && found_problem) {
15857             const UV posix_names[][6] = {
15858                                                 { 'a', 'l', 'n', 'u', 'm' },
15859                                                 { 'a', 'l', 'p', 'h', 'a' },
15860                                                 { 'a', 's', 'c', 'i', 'i' },
15861                                                 { 'b', 'l', 'a', 'n', 'k' },
15862                                                 { 'c', 'n', 't', 'r', 'l' },
15863                                                 { 'd', 'i', 'g', 'i', 't' },
15864                                                 { 'g', 'r', 'a', 'p', 'h' },
15865                                                 { 'l', 'o', 'w', 'e', 'r' },
15866                                                 { 'p', 'r', 'i', 'n', 't' },
15867                                                 { 'p', 'u', 'n', 'c', 't' },
15868                                                 { 's', 'p', 'a', 'c', 'e' },
15869                                                 { 'u', 'p', 'p', 'e', 'r' },
15870                                                 { 'w', 'o', 'r', 'd' },
15871                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15872                                             };
15873             /* The names of the above all have added NULs to make them the same
15874              * size, so we need to also have the real lengths */
15875             const UV posix_name_lengths[] = {
15876                                                 sizeof("alnum") - 1,
15877                                                 sizeof("alpha") - 1,
15878                                                 sizeof("ascii") - 1,
15879                                                 sizeof("blank") - 1,
15880                                                 sizeof("cntrl") - 1,
15881                                                 sizeof("digit") - 1,
15882                                                 sizeof("graph") - 1,
15883                                                 sizeof("lower") - 1,
15884                                                 sizeof("print") - 1,
15885                                                 sizeof("punct") - 1,
15886                                                 sizeof("space") - 1,
15887                                                 sizeof("upper") - 1,
15888                                                 sizeof("word")  - 1,
15889                                                 sizeof("xdigit")- 1
15890                                             };
15891             unsigned int i;
15892             int temp_max = max_distance;    /* Use a temporary, so if we
15893                                                reparse, we haven't changed the
15894                                                outer one */
15895
15896             /* Use a smaller max edit distance if we are missing one of the
15897              * delimiters */
15898             if (   has_opening_bracket + has_opening_colon < 2
15899                 || has_terminating_bracket + has_terminating_colon < 2)
15900             {
15901                 temp_max--;
15902             }
15903
15904             /* See if the input name is close to a legal one */
15905             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15906
15907                 /* Short circuit call if the lengths are too far apart to be
15908                  * able to match */
15909                 if (abs( (int) (name_len - posix_name_lengths[i]))
15910                     > temp_max)
15911                 {
15912                     continue;
15913                 }
15914
15915                 if (edit_distance(input_text,
15916                                   posix_names[i],
15917                                   name_len,
15918                                   posix_name_lengths[i],
15919                                   temp_max
15920                                  )
15921                     > -1)
15922                 { /* If it is close, it probably was intended to be a class */
15923                     goto probably_meant_to_be;
15924                 }
15925             }
15926
15927             /* Here the input name is not close enough to a valid class name
15928              * for us to consider it to be intended to be a posix class.  If
15929              * we haven't already done so, and the parse found a character that
15930              * could have been terminators for the name, but which we absorbed
15931              * as typos during the first pass, repeat the parse, signalling it
15932              * to stop at that character */
15933             if (possible_end && possible_end != (char *) -1) {
15934                 possible_end = (char *) -1;
15935                 p = name_start;
15936                 goto parse_name;
15937             }
15938
15939             /* Here neither pass found a close-enough class name */
15940             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15941         }
15942
15943     probably_meant_to_be:
15944
15945         /* Here we think that a posix specification was intended.  Update any
15946          * parse pointer */
15947         if (updated_parse_ptr) {
15948             *updated_parse_ptr = (char *) p;
15949         }
15950
15951         /* If a posix class name was intended but incorrectly specified, we
15952          * output or return the warnings */
15953         if (found_problem) {
15954
15955             /* We set flags for these issues in the parse loop above instead of
15956              * adding them to the list of warnings, because we can parse it
15957              * twice, and we only want one warning instance */
15958             if (has_upper) {
15959                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15960             }
15961             if (has_blank) {
15962                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15963             }
15964             if (has_semi_colon) {
15965                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15966             }
15967             else if (! has_terminating_colon) {
15968                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15969             }
15970             if (! has_terminating_bracket) {
15971                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15972             }
15973
15974             if (   posix_warnings
15975                 && RExC_warn_text
15976                 && av_top_index(RExC_warn_text) > -1)
15977             {
15978                 *posix_warnings = RExC_warn_text;
15979             }
15980         }
15981         else if (class_number != OOB_NAMEDCLASS) {
15982             /* If it is a known class, return the class.  The class number
15983              * #defines are structured so each complement is +1 to the normal
15984              * one */
15985             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15986         }
15987         else if (! check_only) {
15988
15989             /* Here, it is an unrecognized class.  This is an error (unless the
15990             * call is to check only, which we've already handled above) */
15991             const char * const complement_string = (complement)
15992                                                    ? "^"
15993                                                    : "";
15994             RExC_parse = (char *) p;
15995             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15996                         complement_string,
15997                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15998         }
15999     }
16000
16001     return OOB_NAMEDCLASS;
16002 }
16003 #undef ADD_POSIX_WARNING
16004
16005 STATIC unsigned  int
16006 S_regex_set_precedence(const U8 my_operator) {
16007
16008     /* Map an operator character of the (?[...]) construct to its binding
16009      * strength; a larger number means higher precedence.  The ordering
16010      * follows general Perl rules.  ')' and ']' aren't really operators, but
16011      * they are given the two lowest ranks anyway. */
16012
16013     unsigned int precedence = 0;    /* Stays 0 if nothing below matches */
16014
16015     switch (my_operator) {
16016         case ']': precedence = 1; break;
16017         case ')': precedence = 2; break;
16018         case '-':
16019         case '+':
16020         case '|':
16021         case '^': precedence = 3; break;
16022         case '&': precedence = 4; break;
16023         case '!': precedence = 5; break;
16024         default:
16025             /* No character other than the ones above is expected here */
16026             NOT_REACHED; /* NOTREACHED */
16027             break;
16028     }
16029
16030     return precedence;
16031 }
16032
16033 STATIC regnode_offset
16034 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16035                     I32 *flagp, U32 depth,
16036                     char * const oregcomp_parse)
16037 {
16038     /* Handle the (?[...]) construct to do set operations */
16039
16040     U8 curchar;                     /* Current character being parsed */
16041     UV start, end;                  /* End points of code point ranges */
16042     SV* final = NULL;               /* The end result inversion list */
16043     SV* result_string;              /* 'final' stringified */
16044     AV* stack;                      /* stack of operators and operands not yet
16045                                        resolved */
16046     AV* fence_stack = NULL;         /* A stack containing the positions in
16047                                        'stack' of where the undealt-with left
16048                                        parens would be if they were actually
16049                                        put there */
16050     /* The 'volatile' is a workaround for an optimiser bug
16051      * in Solaris Studio 12.3. See RT #127455 */
16052     volatile IV fence = 0;          /* Position of where most recent undealt-
16053                                        with left paren in stack is; -1 if none.
16054                                      */
16055     STRLEN len;                     /* Temporary */
16056     regnode_offset node;                  /* Temporary, and final regnode returned by
16057                                        this function */
16058     const bool save_fold = FOLD;    /* Temporary */
16059     char *save_end, *save_parse;    /* Temporaries */
16060     const bool in_locale = LOC;     /* we turn off /l during processing */
16061
16062     GET_RE_DEBUG_FLAGS_DECL;
16063
16064     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16065
16066     DEBUG_PARSE("xcls");
16067
16068     if (in_locale) {
16069         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16070     }
16071
16072     /* The use of this operator implies /u.  This is required so that the
16073      * compile time values are valid in all runtime cases */
16074     REQUIRE_UNI_RULES(flagp, 0);
16075
16076     ckWARNexperimental(RExC_parse,
16077                        WARN_EXPERIMENTAL__REGEX_SETS,
16078                        "The regex_sets feature is experimental");
16079
16080     /* Everything in this construct is a metacharacter.  Operands begin with
16081      * either a '\' (for an escape sequence), or a '[' for a bracketed
16082      * character class.  Any other character should be an operator, or
16083      * parenthesis for grouping.  Both types of operands are handled by calling
16084      * regclass() to parse them.  It is called with a parameter to indicate to
16085      * return the computed inversion list.  The parsing here is implemented via
16086      * a stack.  Each entry on the stack is a single character representing one
16087      * of the operators; or else a pointer to an operand inversion list. */
16088
16089 #define IS_OPERATOR(a) SvIOK(a)
16090 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16091
16092     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16093      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16094      * with pronouncing it called it Reverse Polish instead, but now that YOU
16095      * know how to pronounce it you can use the correct term, thus giving due
16096      * credit to the person who invented it, and impressing your geek friends.
16097      * Wikipedia says that the pronunciation of "Ł" has been changing so that
16098      * it is now more like an English initial W (as in wonk) than an L.)
16099      *
16100      * This means that, for example, 'a | b & c' is stored on the stack as
16101      *
16102      * c  [4]
16103      * b  [3]
16104      * &  [2]
16105      * a  [1]
16106      * |  [0]
16107      *
16108      * where the numbers in brackets give the stack [array] element number.
16109      * In this implementation, parentheses are not stored on the stack.
16110      * Instead a '(' creates a "fence" so that the part of the stack below the
16111      * fence is invisible except to the corresponding ')' (this allows us to
16112      * replace testing for parens, by using instead subtraction of the fence
16113      * position).  As new operands are processed they are pushed onto the stack
16114      * (except as noted in the next paragraph).  New operators of higher
16115      * precedence than the current final one are inserted on the stack before
16116      * the lhs operand (so that when the rhs is pushed next, everything will be
16117      * in the correct positions shown above).  When an operator of equal or
16118      * lower precedence is encountered in parsing, all the stacked operations
16119      * of equal or higher precedence are evaluated, leaving the result as the
16120      * top entry on the stack.  This makes higher precedence operations
16121      * evaluate before lower precedence ones, and causes operations of equal
16122      * precedence to left associate.
16123      *
16124      * The only unary operator '!' is immediately pushed onto the stack when
16125      * encountered.  When an operand is encountered, if the top of the stack is
16126      * a '!', the complement is immediately performed, and the '!' popped.  The
16127      * resulting value is treated as a new operand, and the logic in the
16128      * previous paragraph is executed.  Thus in the expression
16129      *      [a] + ! [b]
16130      * the stack looks like
16131      *
16132      * !
16133      * a
16134      * +
16135      *
16136      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16137      * becomes
16138      *
16139      * !b
16140      * a
16141      * +
16142      *
16143      * A ')' is treated as an operator with lower precedence than all the
16144      * aforementioned ones, which causes all operations on the stack above the
16145      * corresponding '(' to be evaluated down to a single resultant operand.
16146      * Then the fence for the '(' is removed, and the operand goes through the
16147      * algorithm above, without the fence.
16148      *
16149      * A separate stack is kept of the fence positions, so that the position of
16150      * the latest so-far unbalanced '(' is at the top of it.
16151      *
16152      * The ']' ending the construct is treated as the lowest operator of all,
16153      * so that everything gets evaluated down to a single operand, which is the
16154      * result */
16155
16156     sv_2mortal((SV *)(stack = newAV()));
16157     sv_2mortal((SV *)(fence_stack = newAV()));
16158
16159     while (RExC_parse < RExC_end) {
16160         I32 top_index;              /* Index of top-most element in 'stack' */
16161         SV** top_ptr;               /* Pointer to top 'stack' element */
16162         SV* current = NULL;         /* To contain the current inversion list
16163                                        operand */
16164         SV* only_to_avoid_leaks;
16165
16166         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16167                                 TRUE /* Force /x */ );
16168         if (RExC_parse >= RExC_end) {   /* Fail */
16169             break;
16170         }
16171
16172         curchar = UCHARAT(RExC_parse);
16173
16174 redo_curchar:
16175
16176 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16177                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16178         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16179                                            stack, fence, fence_stack));
16180 #endif
16181
16182         top_index = av_tindex_skip_len_mg(stack);
16183
16184         switch (curchar) {
16185             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16186             char stacked_operator;  /* The topmost operator on the 'stack'. */
16187             SV* lhs;                /* Operand to the left of the operator */
16188             SV* rhs;                /* Operand to the right of the operator */
16189             SV* fence_ptr;          /* Pointer to top element of the fence
16190                                        stack */
16191
16192             case '(':
16193
16194                 if (   RExC_parse < RExC_end - 2
16195                     && UCHARAT(RExC_parse + 1) == '?'
16196                     && UCHARAT(RExC_parse + 2) == '^')
16197                 {
16198                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
16199                      * This happens when we have something like
16200                      *
16201                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16202                      *   ...
16203                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16204                      *
16205                      * Here we would be handling the interpolated
16206                      * '$thai_or_lao'.  We handle this by a recursive call to
16207                      * ourselves which returns the inversion list the
16208                      * interpolated expression evaluates to.  We use the flags
16209                      * from the interpolated pattern. */
16210                     U32 save_flags = RExC_flags;
16211                     const char * save_parse;
16212
16213                     RExC_parse += 2;        /* Skip past the '(?' */
16214                     save_parse = RExC_parse;
16215
16216                     /* Parse the flags for the '(?'.  We already know the first
16217                      * flag to parse is a '^' */
16218                     parse_lparen_question_flags(pRExC_state);
16219
16220                     if (   RExC_parse >= RExC_end - 4
16221                         || UCHARAT(RExC_parse) != ':'
16222                         || UCHARAT(++RExC_parse) != '('
16223                         || UCHARAT(++RExC_parse) != '?'
16224                         || UCHARAT(++RExC_parse) != '[')
16225                     {
16226
16227                         /* In combination with the above, this moves the
16228                          * pointer to the point just after the first erroneous
16229                          * character. */
16230                         if (RExC_parse >= RExC_end - 4) {
16231                             RExC_parse = RExC_end;
16232                         }
16233                         else if (RExC_parse != save_parse) {
16234                             RExC_parse += (UTF)
16235                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
16236                                           : 1;
16237                         }
16238                         vFAIL("Expecting '(?flags:(?[...'");
16239                     }
16240
16241                     /* Recurse, with the meat of the embedded expression */
16242                     RExC_parse++;
16243                     if (! handle_regex_sets(pRExC_state, &current, flagp,
16244                                                     depth+1, oregcomp_parse))
16245                     {
16246                         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16247                     }
16248
16249                     /* Here, 'current' contains the embedded expression's
16250                      * inversion list, and RExC_parse points to the trailing
16251                      * ']'; the next character should be the ')' */
16252                     RExC_parse++;
16253                     if (UCHARAT(RExC_parse) != ')')
16254                         vFAIL("Expecting close paren for nested extended charclass");
16255
16256                     /* Then the ')' matching the original '(' handled by this
16257                      * case: statement */
16258                     RExC_parse++;
16259                     if (UCHARAT(RExC_parse) != ')')
16260                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
16261
16262                     RExC_flags = save_flags;
16263                     goto handle_operand;
16264                 }
16265
16266                 /* A regular '('.  Look behind for illegal syntax */
16267                 if (top_index - fence >= 0) {
16268                     /* If the top entry on the stack is an operator, it had
16269                      * better be a '!', otherwise the entry below the top
16270                      * operand should be an operator */
16271                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16272                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16273                         || (   IS_OPERAND(*top_ptr)
16274                             && (   top_index - fence < 1
16275                                 || ! (stacked_ptr = av_fetch(stack,
16276                                                              top_index - 1,
16277                                                              FALSE))
16278                                 || ! IS_OPERATOR(*stacked_ptr))))
16279                     {
16280                         RExC_parse++;
16281                         vFAIL("Unexpected '(' with no preceding operator");
16282                     }
16283                 }
16284
16285                 /* Stack the position of this undealt-with left paren */
16286                 av_push(fence_stack, newSViv(fence));
16287                 fence = top_index + 1;
16288                 break;
16289
16290             case '\\':
16291                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16292                  * multi-char folds are allowed.  */
16293                 if (!regclass(pRExC_state, flagp, depth+1,
16294                               TRUE, /* means parse just the next thing */
16295                               FALSE, /* don't allow multi-char folds */
16296                               FALSE, /* don't silence non-portable warnings.  */
16297                               TRUE,  /* strict */
16298                               FALSE, /* Require return to be an ANYOF */
16299                               &current))
16300                 {
16301                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16302                     goto regclass_failed;
16303                 }
16304
16305                 /* regclass() will return with parsing just the \ sequence,
16306                  * leaving the parse pointer at the next thing to parse */
16307                 RExC_parse--;
16308                 goto handle_operand;
16309
16310             case '[':   /* Is a bracketed character class */
16311             {
16312                 /* See if this is a [:posix:] class. */
16313                 bool is_posix_class = (OOB_NAMEDCLASS
16314                             < handle_possible_posix(pRExC_state,
16315                                                 RExC_parse + 1,
16316                                                 NULL,
16317                                                 NULL,
16318                                                 TRUE /* checking only */));
16319                 /* If it is a posix class, leave the parse pointer at the '['
16320                  * to fool regclass() into thinking it is part of a
16321                  * '[[:posix:]]'. */
16322                 if (! is_posix_class) {
16323                     RExC_parse++;
16324                 }
16325
16326                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16327                  * multi-char folds are allowed.  */
16328                 if (!regclass(pRExC_state, flagp, depth+1,
16329                                 is_posix_class, /* parse the whole char
16330                                                     class only if not a
16331                                                     posix class */
16332                                 FALSE, /* don't allow multi-char folds */
16333                                 TRUE, /* silence non-portable warnings. */
16334                                 TRUE, /* strict */
16335                                 FALSE, /* Require return to be an ANYOF */
16336                                 &current))
16337                 {
16338                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16339                     goto regclass_failed;
16340                 }
16341
16342                 if (! current) {
16343                     break;
16344                 }
16345
16346                 /* function call leaves parse pointing to the ']', except if we
16347                  * faked it */
16348                 if (is_posix_class) {
16349                     RExC_parse--;
16350                 }
16351
16352                 goto handle_operand;
16353             }
16354
16355             case ']':
16356                 if (top_index >= 1) {
16357                     goto join_operators;
16358                 }
16359
16360                 /* Only a single operand on the stack: are done */
16361                 goto done;
16362
16363             case ')':
16364                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16365                     if (UCHARAT(RExC_parse - 1) == ']')  {
16366                         break;
16367                     }
16368                     RExC_parse++;
16369                     vFAIL("Unexpected ')'");
16370                 }
16371
16372                 /* If nothing after the fence, is missing an operand */
16373                 if (top_index - fence < 0) {
16374                     RExC_parse++;
16375                     goto bad_syntax;
16376                 }
16377                 /* If at least two things on the stack, treat this as an
16378                   * operator */
16379                 if (top_index - fence >= 1) {
16380                     goto join_operators;
16381                 }
16382
16383                 /* Here only a single thing on the fenced stack, and there is a
16384                  * fence.  Get rid of it */
16385                 fence_ptr = av_pop(fence_stack);
16386                 assert(fence_ptr);
16387                 fence = SvIV(fence_ptr);
16388                 SvREFCNT_dec_NN(fence_ptr);
16389                 fence_ptr = NULL;
16390
16391                 if (fence < 0) {
16392                     fence = 0;
16393                 }
16394
16395                 /* Having gotten rid of the fence, we pop the operand at the
16396                  * stack top and process it as a newly encountered operand */
16397                 current = av_pop(stack);
16398                 if (IS_OPERAND(current)) {
16399                     goto handle_operand;
16400                 }
16401
16402                 RExC_parse++;
16403                 goto bad_syntax;
16404
16405             case '&':
16406             case '|':
16407             case '+':
16408             case '-':
16409             case '^':
16410
16411                 /* These binary operators should have a left operand already
16412                  * parsed */
16413                 if (   top_index - fence < 0
16414                     || top_index - fence == 1
16415                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16416                     || ! IS_OPERAND(*top_ptr))
16417                 {
16418                     goto unexpected_binary;
16419                 }
16420
16421                 /* If only the one operand is on the part of the stack visible
16422                  * to us, we just place this operator in the proper position */
16423                 if (top_index - fence < 2) {
16424
16425                     /* Place the operator before the operand */
16426
16427                     SV* lhs = av_pop(stack);
16428                     av_push(stack, newSVuv(curchar));
16429                     av_push(stack, lhs);
16430                     break;
16431                 }
16432
16433                 /* But if there is something else on the stack, we need to
16434                  * process it before this new operator if and only if the
16435                  * stacked operation has equal or higher precedence than the
16436                  * new one */
16437
16438              join_operators:
16439
16440                 /* The operator on the stack is supposed to be below both its
16441                  * operands */
16442                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16443                     || IS_OPERAND(*stacked_ptr))
16444                 {
16445                     /* But if not, it's legal and indicates we are completely
16446                      * done if and only if we're currently processing a ']',
16447                      * which should be the final thing in the expression */
16448                     if (curchar == ']') {
16449                         goto done;
16450                     }
16451
16452                   unexpected_binary:
16453                     RExC_parse++;
16454                     vFAIL2("Unexpected binary operator '%c' with no "
16455                            "preceding operand", curchar);
16456                 }
16457                 stacked_operator = (char) SvUV(*stacked_ptr);
16458
16459                 if (regex_set_precedence(curchar)
16460                     > regex_set_precedence(stacked_operator))
16461                 {
16462                     /* Here, the new operator has higher precedence than the
16463                      * stacked one.  This means we need to add the new one to
16464                      * the stack to await its rhs operand (and maybe more
16465                      * stuff).  We put it before the lhs operand, leaving
16466                      * untouched the stacked operator and everything below it
16467                      * */
16468                     lhs = av_pop(stack);
16469                     assert(IS_OPERAND(lhs));
16470
16471                     av_push(stack, newSVuv(curchar));
16472                     av_push(stack, lhs);
16473                     break;
16474                 }
16475
16476                 /* Here, the new operator has equal or lower precedence than
16477                  * what's already there.  This means the operation already
16478                  * there should be performed now, before the new one. */
16479
16480                 rhs = av_pop(stack);
16481                 if (! IS_OPERAND(rhs)) {
16482
16483                     /* This can happen when a ! is not followed by an operand,
16484                      * like in /(?[\t &!])/ */
16485                     goto bad_syntax;
16486                 }
16487
16488                 lhs = av_pop(stack);
16489
16490                 if (! IS_OPERAND(lhs)) {
16491
16492                     /* This can happen when there is an empty (), like in
16493                      * /(?[[0]+()+])/ */
16494                     goto bad_syntax;
16495                 }
16496
16497                 switch (stacked_operator) {
16498                     case '&':
16499                         _invlist_intersection(lhs, rhs, &rhs);
16500                         break;
16501
16502                     case '|':
16503                     case '+':
16504                         _invlist_union(lhs, rhs, &rhs);
16505                         break;
16506
16507                     case '-':
16508                         _invlist_subtract(lhs, rhs, &rhs);
16509                         break;
16510
16511                     case '^':   /* The union minus the intersection */
16512                     {
16513                         SV* i = NULL;
16514                         SV* u = NULL;
16515
16516                         _invlist_union(lhs, rhs, &u);
16517                         _invlist_intersection(lhs, rhs, &i);
16518                         _invlist_subtract(u, i, &rhs);
16519                         SvREFCNT_dec_NN(i);
16520                         SvREFCNT_dec_NN(u);
16521                         break;
16522                     }
16523                 }
16524                 SvREFCNT_dec(lhs);
16525
16526                 /* Here, the higher precedence operation has been done, and the
16527                  * result is in 'rhs'.  We overwrite the stacked operator with
16528                  * the result.  Then we redo this code to either push the new
16529                  * operator onto the stack or perform any higher precedence
16530                  * stacked operation */
16531                 only_to_avoid_leaks = av_pop(stack);
16532                 SvREFCNT_dec(only_to_avoid_leaks);
16533                 av_push(stack, rhs);
16534                 goto redo_curchar;
16535
16536             case '!':   /* Highest priority, right associative */
16537
16538                 /* If what's already at the top of the stack is another '!',
16539                  * they just cancel each other out */
16540                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16541                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16542                 {
16543                     only_to_avoid_leaks = av_pop(stack);
16544                     SvREFCNT_dec(only_to_avoid_leaks);
16545                 }
16546                 else { /* Otherwise, since it's right associative, just push
16547                           onto the stack */
16548                     av_push(stack, newSVuv(curchar));
16549                 }
16550                 break;
16551
16552             default:
16553                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16554                 if (RExC_parse >= RExC_end) {
16555                     break;
16556                 }
16557                 vFAIL("Unexpected character");
16558
16559           handle_operand:
16560
16561             /* Here 'current' is the operand.  If something is already on the
16562              * stack, we have to check if it is a !.  But first, the code above
16563              * may have altered the stack in the time since we earlier set
16564              * 'top_index'.  */
16565
16566             top_index = av_tindex_skip_len_mg(stack);
16567             if (top_index - fence >= 0) {
16568                 /* If the top entry on the stack is an operator, it had better
16569                  * be a '!', otherwise the entry below the top operand should
16570                  * be an operator */
16571                 top_ptr = av_fetch(stack, top_index, FALSE);
16572                 assert(top_ptr);
16573                 if (IS_OPERATOR(*top_ptr)) {
16574
16575                     /* The only permissible operator at the top of the stack is
16576                      * '!', which is applied immediately to this operand. */
16577                     curchar = (char) SvUV(*top_ptr);
16578                     if (curchar != '!') {
16579                         SvREFCNT_dec(current);
16580                         vFAIL2("Unexpected binary operator '%c' with no "
16581                                 "preceding operand", curchar);
16582                     }
16583
16584                     _invlist_invert(current);
16585
16586                     only_to_avoid_leaks = av_pop(stack);
16587                     SvREFCNT_dec(only_to_avoid_leaks);
16588
16589                     /* And we redo with the inverted operand.  This allows
16590                      * handling multiple ! in a row */
16591                     goto handle_operand;
16592                 }
16593                           /* Single operand is ok only for the non-binary ')'
16594                            * operator */
16595                 else if ((top_index - fence == 0 && curchar != ')')
16596                          || (top_index - fence > 0
16597                              && (! (stacked_ptr = av_fetch(stack,
16598                                                            top_index - 1,
16599                                                            FALSE))
16600                                  || IS_OPERAND(*stacked_ptr))))
16601                 {
16602                     SvREFCNT_dec(current);
16603                     vFAIL("Operand with no preceding operator");
16604                 }
16605             }
16606
16607             /* Here there was nothing on the stack or the top element was
16608              * another operand.  Just add this new one */
16609             av_push(stack, current);
16610
16611         } /* End of switch on next parse token */
16612
16613         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16614     } /* End of loop parsing through the construct */
16615
16616     vFAIL("Syntax error in (?[...])");
16617
16618   done:
16619
16620     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16621         if (RExC_parse < RExC_end) {
16622             RExC_parse++;
16623         }
16624
16625         vFAIL("Unexpected ']' with no following ')' in (?[...");
16626     }
16627
16628     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16629         vFAIL("Unmatched (");
16630     }
16631
16632     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16633         || ((final = av_pop(stack)) == NULL)
16634         || ! IS_OPERAND(final)
16635         || ! is_invlist(final)
16636         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16637     {
16638       bad_syntax:
16639         SvREFCNT_dec(final);
16640         vFAIL("Incomplete expression within '(?[ ])'");
16641     }
16642
16643     /* Here, 'final' is the resultant inversion list from evaluating the
16644      * expression.  Return it if so requested */
16645     if (return_invlist) {
16646         *return_invlist = final;
16647         return END;
16648     }
16649
16650     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16651      * expecting a string of ranges and individual code points */
16652     invlist_iterinit(final);
16653     result_string = newSVpvs("");
16654     while (invlist_iternext(final, &start, &end)) {
16655         if (start == end) {
16656             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16657         }
16658         else {
16659             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16660                                                      start,          end);
16661         }
16662     }
16663
16664     /* About to generate an ANYOF (or similar) node from the inversion list we
16665      * have calculated */
16666     save_parse = RExC_parse;
16667     RExC_parse = SvPV(result_string, len);
16668     save_end = RExC_end;
16669     RExC_end = RExC_parse + len;
16670     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16671
16672     /* We turn off folding around the call, as the class we have constructed
16673      * already has all folding taken into consideration, and we don't want
16674      * regclass() to add to that */
16675     RExC_flags &= ~RXf_PMf_FOLD;
16676     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16677      * folds are allowed.  */
16678     node = regclass(pRExC_state, flagp, depth+1,
16679                     FALSE, /* means parse the whole char class */
16680                     FALSE, /* don't allow multi-char folds */
16681                     TRUE, /* silence non-portable warnings.  The above may very
16682                              well have generated non-portable code points, but
16683                              they're valid on this machine */
16684                     FALSE, /* similarly, no need for strict */
16685
16686                     /* We can optimize into something besides an ANYOF, except
16687                      * under /l, which needs to be ANYOF because of runtime
16688                      * checks for locale sanity, etc */
16689                   ! in_locale,
16690                     NULL
16691                 );
16692
16693     RESTORE_WARNINGS;
16694     RExC_parse = save_parse + 1;
16695     RExC_end = save_end;
16696     SvREFCNT_dec_NN(final);
16697     SvREFCNT_dec_NN(result_string);
16698
16699     if (save_fold) {
16700         RExC_flags |= RXf_PMf_FOLD;
16701     }
16702
16703     if (!node) {
16704         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16705         goto regclass_failed;
16706     }
16707
16708     /* Fix up the node type if we are in locale.  (We have pretended we are
16709      * under /u for the purposes of regclass(), as this construct will only
16710      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16711      * as to cause any warnings about bad locales to be output in regexec.c),
16712      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16713      * reason we above forbid optimization into something other than an ANYOF
16714      * node is simply to minimize the number of code changes in regexec.c.
16715      * Otherwise we would have to create new EXACTish node types and deal with
16716      * them.  This decision could be revisited should this construct become
16717      * popular.
16718      *
16719      * (One might think we could look at the resulting ANYOF node and suppress
16720      * the flag if everything is above 255, as those would be UTF-8 only,
16721      * but this isn't true, as the components that led to that result could
16722      * have been locale-affected, and just happen to cancel each other out
16723      * under UTF-8 locales.) */
16724     if (in_locale) {
16725         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16726
16727         assert(OP(REGNODE_p(node)) == ANYOF);
16728
16729         OP(REGNODE_p(node)) = ANYOFL;
16730         ANYOF_FLAGS(REGNODE_p(node))
16731                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16732     }
16733
16734     nextchar(pRExC_state);
16735     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16736     return node;
16737
16738   regclass_failed:
16739     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16740                                                                 (UV) *flagp);
16741 }
16742
16743 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16744
16745 STATIC void
16746 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16747                              AV * stack, const IV fence, AV * fence_stack)
16748 {   /* Dumps the stacks in handle_regex_sets() */
16749
16750     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16751     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16752     SSize_t i;
16753
16754     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16755
16756     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16757
16758     if (stack_top < 0) {
16759         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16760     }
16761     else {
16762         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16763         for (i = stack_top; i >= 0; i--) {
16764             SV ** element_ptr = av_fetch(stack, i, FALSE);
16765             if (! element_ptr) {
16766             }
16767
16768             if (IS_OPERATOR(*element_ptr)) {
16769                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16770                                             (int) i, (int) SvIV(*element_ptr));
16771             }
16772             else {
16773                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16774                 sv_dump(*element_ptr);
16775             }
16776         }
16777     }
16778
16779     if (fence_stack_top < 0) {
16780         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16781     }
16782     else {
16783         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16784         for (i = fence_stack_top; i >= 0; i--) {
16785             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16786             if (! element_ptr) {
16787             }
16788
16789             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16790                                             (int) i, (int) SvIV(*element_ptr));
16791         }
16792     }
16793 }
16794
16795 #endif
16796
16797 #undef IS_OPERATOR
16798 #undef IS_OPERAND
16799
16800 STATIC void
16801 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16802 {
16803     /* This adds the Latin1/above-Latin1 folding rules.
16804      *
16805      * This should be called only for a Latin1-range code points, cp, which is
16806      * known to be involved in a simple fold with other code points above
16807      * Latin1.  It would give false results if /aa has been specified.
16808      * Multi-char folds are outside the scope of this, and must be handled
16809      * specially. */
16810
16811     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16812
16813     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16814
16815     /* The rules that are valid for all Unicode versions are hard-coded in */
16816     switch (cp) {
16817         case 'k':
16818         case 'K':
16819           *invlist =
16820              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16821             break;
16822         case 's':
16823         case 'S':
16824           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16825             break;
16826         case MICRO_SIGN:
16827           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16828           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16829             break;
16830         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16831         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16832           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16833             break;
16834         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16835           *invlist = add_cp_to_invlist(*invlist,
16836                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16837             break;
16838
16839         default:    /* Other code points are checked against the data for the
16840                        current Unicode version */
16841           {
16842             Size_t folds_count;
16843             unsigned int first_fold;
16844             const unsigned int * remaining_folds;
16845             UV folded_cp;
16846
16847             if (isASCII(cp)) {
16848                 folded_cp = toFOLD(cp);
16849             }
16850             else {
16851                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16852                 Size_t dummy_len;
16853                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16854             }
16855
16856             if (folded_cp > 255) {
16857                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16858             }
16859
16860             folds_count = _inverse_folds(folded_cp, &first_fold,
16861                                                     &remaining_folds);
16862             if (folds_count == 0) {
16863
16864                 /* Use deprecated warning to increase the chances of this being
16865                  * output */
16866                 ckWARN2reg_d(RExC_parse,
16867                         "Perl folding rules are not up-to-date for 0x%02X;"
16868                         " please use the perlbug utility to report;", cp);
16869             }
16870             else {
16871                 unsigned int i;
16872
16873                 if (first_fold > 255) {
16874                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16875                 }
16876                 for (i = 0; i < folds_count - 1; i++) {
16877                     if (remaining_folds[i] > 255) {
16878                         *invlist = add_cp_to_invlist(*invlist,
16879                                                     remaining_folds[i]);
16880                     }
16881                 }
16882             }
16883             break;
16884          }
16885     }
16886 }
16887
16888 STATIC void
16889 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16890 {
16891     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16892      * warnings. */
16893
16894     SV * msg;
16895     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16896
16897     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16898
16899     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16900         return;
16901     }
16902
16903     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16904         if (first_is_fatal) {           /* Avoid leaking this */
16905             av_undef(posix_warnings);   /* This isn't necessary if the
16906                                             array is mortal, but is a
16907                                             fail-safe */
16908             (void) sv_2mortal(msg);
16909             PREPARE_TO_DIE;
16910         }
16911         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16912         SvREFCNT_dec_NN(msg);
16913     }
16914
16915     UPDATE_WARNINGS_LOC(RExC_parse);
16916 }
16917
16918 PERL_STATIC_INLINE Size_t
16919 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
16920 {
16921     const U8 * const start = s1;
16922     const U8 * const send = start + max;
16923
16924     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
16925
16926     while (s1 < send && *s1  == *s2) {
16927         s1++; s2++;
16928     }
16929
16930     return s1 - start;
16931 }
16932
16933
16934 STATIC AV *
16935 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16936 {
16937     /* This adds the string scalar <multi_string> to the array
16938      * <multi_char_matches>.  <multi_string> is known to have exactly
16939      * <cp_count> code points in it.  This is used when constructing a
16940      * bracketed character class and we find something that needs to match more
16941      * than a single character.
16942      *
16943      * <multi_char_matches> is actually an array of arrays.  Each top-level
16944      * element is an array that contains all the strings known so far that are
16945      * the same length.  And that length (in number of code points) is the same
16946      * as the index of the top-level array.  Hence, the [2] element is an
16947      * array, each element thereof is a string containing TWO code points;
16948      * while element [3] is for strings of THREE characters, and so on.  Since
16949      * this is for multi-char strings there can never be a [0] nor [1] element.
16950      *
16951      * When we rewrite the character class below, we will do so such that the
16952      * longest strings are written first, so that it prefers the longest
16953      * matching strings first.  This is done even if it turns out that any
16954      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16955      * Christiansen has agreed that this is ok.  This makes the test for the
16956      * ligature 'ffi' come before the test for 'ff', for example */
16957
16958     AV* this_array;
16959     AV** this_array_ptr;
16960
16961     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16962
16963     if (! multi_char_matches) {
16964         multi_char_matches = newAV();
16965     }
16966
16967     if (av_exists(multi_char_matches, cp_count)) {
16968         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16969         this_array = *this_array_ptr;
16970     }
16971     else {
16972         this_array = newAV();
16973         av_store(multi_char_matches, cp_count,
16974                  (SV*) this_array);
16975     }
16976     av_push(this_array, multi_string);
16977
16978     return multi_char_matches;
16979 }
16980
/* 'listsv' starts out holding just a constant heading, and the names of any
 * properties whose definitions are not known at compile time get stored in it
 * after that.  So if its length differs from what it was upon initialization,
 * there is a run-time definition. */
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
                                        (initial_listsv_len != SvCUR(listsv))
16986
/* There is a restricted set of white space characters that are legal when
 * ignoring white space in a bracketed character class.  This generates the
 * code to skip them: when 'do_skip' is true, 'p' is advanced for as long as
 * it points to one of them; when it is false, 'p' is left alone.
 *
 * 'p' must be a modifiable lvalue.  It is parenthesized where it is
 * incremented, so that an argument such as '*pp' advances the pointer that
 * '*pp' designates, and not 'pp' itself.
 *
 * There is a line below that uses the same white space criteria but is outside
 * this macro.  Both here and there must use the same definition */
#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
    STMT_START {                                                        \
        if (do_skip) {                                                  \
            while (isBLANK_A(UCHARAT(p)))                               \
            {                                                           \
                (p)++;                                                  \
            }                                                           \
        }                                                               \
    } STMT_END
17002
17003 STATIC regnode_offset
17004 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17005                  const bool stop_at_1,  /* Just parse the next thing, don't
17006                                            look for a full character class */
17007                  bool allow_mutiple_chars,
17008                  const bool silence_non_portable,   /* Don't output warnings
17009                                                        about too large
17010                                                        characters */
17011                  const bool strict,
17012                  bool optimizable,                  /* ? Allow a non-ANYOF return
17013                                                        node */
17014                  SV** ret_invlist  /* Return an inversion list, not a node */
17015           )
17016 {
17017     /* parse a bracketed class specification.  Most of these will produce an
17018      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17019      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17020      * under /i with multi-character folds: it will be rewritten following the
17021      * paradigm of this example, where the <multi-fold>s are characters which
17022      * fold to multiple character sequences:
17023      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17024      * gets effectively rewritten as:
17025      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi])/i
17026      * reg() gets called (recursively) on the rewritten version, and this
17027      * function will return what it constructs.  (Actually the <multi-fold>s
17028      * aren't physically removed from the [abcdefghi], it's just that they are
17029      * ignored in the recursion by means of a flag:
17030      * <RExC_in_multi_char_class>.)
17031      *
17032      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17033      * characters, with the corresponding bit set if that character is in the
17034      * list.  For characters above this, an inversion list is used.  There
17035      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17036      * determinable at compile time
17037      *
17038      * On success, returns the offset at which any next node should be placed
17039      * into the regex engine program being compiled.
17040      *
17041      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17042      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17043      * UTF-8
17044      */
17045
17046     dVAR;
17047     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17048     IV range = 0;
17049     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17050     regnode_offset ret = -1;    /* Initialized to an illegal value */
17051     STRLEN numlen;
17052     int namedclass = OOB_NAMEDCLASS;
17053     char *rangebegin = NULL;
17054     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17055                                aren't available at the time this was called */
17056     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17057                                       than just initialized.  */
17058     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17059     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17060                                extended beyond the Latin1 range.  These have to
17061                                be kept separate from other code points for much
17062                                of this function because their handling  is
17063                                different under /i, and for most classes under
17064                                /d as well */
17065     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17066                                separate for a while from the non-complemented
17067                                versions because of complications with /d
17068                                matching */
17069     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17070                                   treated more simply than the general case,
17071                                   leading to less compilation and execution
17072                                   work */
17073     UV element_count = 0;   /* Number of distinct elements in the class.
17074                                Optimizations may be possible if this is tiny */
17075     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17076                                        character; used under /i */
17077     UV n;
17078     char * stop_ptr = RExC_end;    /* where to stop parsing */
17079
17080     /* ignore unescaped whitespace? */
17081     const bool skip_white = cBOOL(   ret_invlist
17082                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17083
17084     /* inversion list of code points this node matches only when the target
17085      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17086      * /d) */
17087     SV* upper_latin1_only_utf8_matches = NULL;
17088
17089     /* Inversion list of code points this node matches regardless of things
17090      * like locale, folding, utf8ness of the target string */
17091     SV* cp_list = NULL;
17092
17093     /* Like cp_list, but code points on this list need to be checked for things
17094      * that fold to/from them under /i */
17095     SV* cp_foldable_list = NULL;
17096
17097     /* Like cp_list, but code points on this list are valid only when the
17098      * runtime locale is UTF-8 */
17099     SV* only_utf8_locale_list = NULL;
17100
17101     /* In a range, if one of the endpoints is non-character-set portable,
17102      * meaning that it hard-codes a code point that may mean a different
17103      * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17104      * mnemonic '\t' which each mean the same character no matter which
17105      * character set the platform is on. */
17106     unsigned int non_portable_endpoint = 0;
17107
17108     /* Is the range unicode? which means on a platform that isn't 1-1 native
17109      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17110      * to be a Unicode value.  */
17111     bool unicode_range = FALSE;
17112     bool invert = FALSE;    /* Is this class to be complemented */
17113
17114     bool warn_super = ALWAYS_WARN_SUPER;
17115
17116     const char * orig_parse = RExC_parse;
17117
17118     /* This variable is used to mark where the end in the input is of something
17119      * that looks like a POSIX construct but isn't.  During the parse, when
17120      * something looks like it could be such a construct is encountered, it is
17121      * checked for being one, but not if we've already checked this area of the
17122      * input.  Only after this position is reached do we check again */
17123     char *not_posix_region_end = RExC_parse - 1;
17124
17125     AV* posix_warnings = NULL;
17126     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17127     U8 op = END;    /* The returned node-type, initialized to an impossible
17128                        one.  */
17129     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17130     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17131
17132
17133 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17134  * mutually exclusive.) */
17135 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17136                                             haven't been defined as of yet */
17137 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17138                                             UTF-8 or not */
17139 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17140                                             what gets folded */
17141     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17142
17143     GET_RE_DEBUG_FLAGS_DECL;
17144
17145     PERL_ARGS_ASSERT_REGCLASS;
17146 #ifndef DEBUGGING
17147     PERL_UNUSED_ARG(depth);
17148 #endif
17149
17150
17151     /* If wants an inversion list returned, we can't optimize to something
17152      * else. */
17153     if (ret_invlist) {
17154         optimizable = FALSE;
17155     }
17156
17157     DEBUG_PARSE("clas");
17158
17159 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17160     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17161                                    && UNICODE_DOT_DOT_VERSION == 0)
17162     allow_mutiple_chars = FALSE;
17163 #endif
17164
17165     /* We include the /i status at the beginning of this so that we can
17166      * know it at runtime */
17167     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17168     initial_listsv_len = SvCUR(listsv);
17169     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17170
17171     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17172
17173     assert(RExC_parse <= RExC_end);
17174
17175     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
17176         RExC_parse++;
17177         invert = TRUE;
17178         allow_mutiple_chars = FALSE;
17179         MARK_NAUGHTY(1);
17180         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17181     }
17182
17183     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17184     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17185         int maybe_class = handle_possible_posix(pRExC_state,
17186                                                 RExC_parse,
17187                                                 &not_posix_region_end,
17188                                                 NULL,
17189                                                 TRUE /* checking only */);
17190         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17191             ckWARN4reg(not_posix_region_end,
17192                     "POSIX syntax [%c %c] belongs inside character classes%s",
17193                     *RExC_parse, *RExC_parse,
17194                     (maybe_class == OOB_NAMEDCLASS)
17195                     ? ((POSIXCC_NOTYET(*RExC_parse))
17196                         ? " (but this one isn't implemented)"
17197                         : " (but this one isn't fully valid)")
17198                     : ""
17199                     );
17200         }
17201     }
17202
17203     /* If the caller wants us to just parse a single element, accomplish this
17204      * by faking the loop ending condition */
17205     if (stop_at_1 && RExC_end > RExC_parse) {
17206         stop_ptr = RExC_parse + 1;
17207     }
17208
17209     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17210     if (UCHARAT(RExC_parse) == ']')
17211         goto charclassloop;
17212
17213     while (1) {
17214
17215         if (   posix_warnings
17216             && av_tindex_skip_len_mg(posix_warnings) >= 0
17217             && RExC_parse > not_posix_region_end)
17218         {
17219             /* Warnings about posix class issues are considered tentative until
17220              * we are far enough along in the parse that we can no longer
17221              * change our mind, at which point we output them.  This is done
17222              * each time through the loop so that a later class won't zap them
17223              * before they have been dealt with. */
17224             output_posix_warnings(pRExC_state, posix_warnings);
17225         }
17226
17227         if  (RExC_parse >= stop_ptr) {
17228             break;
17229         }
17230
17231         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17232
17233         if  (UCHARAT(RExC_parse) == ']') {
17234             break;
17235         }
17236
17237       charclassloop:
17238
17239         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17240         save_value = value;
17241         save_prevvalue = prevvalue;
17242
17243         if (!range) {
17244             rangebegin = RExC_parse;
17245             element_count++;
17246             non_portable_endpoint = 0;
17247         }
17248         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17249             value = utf8n_to_uvchr((U8*)RExC_parse,
17250                                    RExC_end - RExC_parse,
17251                                    &numlen, UTF8_ALLOW_DEFAULT);
17252             RExC_parse += numlen;
17253         }
17254         else
17255             value = UCHARAT(RExC_parse++);
17256
17257         if (value == '[') {
17258             char * posix_class_end;
17259             namedclass = handle_possible_posix(pRExC_state,
17260                                                RExC_parse,
17261                                                &posix_class_end,
17262                                                do_posix_warnings ? &posix_warnings : NULL,
17263                                                FALSE    /* die if error */);
17264             if (namedclass > OOB_NAMEDCLASS) {
17265
17266                 /* If there was an earlier attempt to parse this particular
17267                  * posix class, and it failed, it was a false alarm, as this
17268                  * successful one proves */
17269                 if (   posix_warnings
17270                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17271                     && not_posix_region_end >= RExC_parse
17272                     && not_posix_region_end <= posix_class_end)
17273                 {
17274                     av_undef(posix_warnings);
17275                 }
17276
17277                 RExC_parse = posix_class_end;
17278             }
17279             else if (namedclass == OOB_NAMEDCLASS) {
17280                 not_posix_region_end = posix_class_end;
17281             }
17282             else {
17283                 namedclass = OOB_NAMEDCLASS;
17284             }
17285         }
17286         else if (   RExC_parse - 1 > not_posix_region_end
17287                  && MAYBE_POSIXCC(value))
17288         {
17289             (void) handle_possible_posix(
17290                         pRExC_state,
17291                         RExC_parse - 1,  /* -1 because parse has already been
17292                                             advanced */
17293                         &not_posix_region_end,
17294                         do_posix_warnings ? &posix_warnings : NULL,
17295                         TRUE /* checking only */);
17296         }
17297         else if (  strict && ! skip_white
17298                  && (   _generic_isCC(value, _CC_VERTSPACE)
17299                      || is_VERTWS_cp_high(value)))
17300         {
17301             vFAIL("Literal vertical space in [] is illegal except under /x");
17302         }
17303         else if (value == '\\') {
17304             /* Is a backslash; get the code point of the char after it */
17305
17306             if (RExC_parse >= RExC_end) {
17307                 vFAIL("Unmatched [");
17308             }
17309
17310             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17311                 value = utf8n_to_uvchr((U8*)RExC_parse,
17312                                    RExC_end - RExC_parse,
17313                                    &numlen, UTF8_ALLOW_DEFAULT);
17314                 RExC_parse += numlen;
17315             }
17316             else
17317                 value = UCHARAT(RExC_parse++);
17318
17319             /* Some compilers cannot handle switching on 64-bit integer
17320              * values, therefore value cannot be a UV.  Yes, this will
17321              * be a problem later if we want to switch on Unicode.
17322              * A similar issue arises a little bit later when switching on
17323              * namedclass. --jhi */
17324
17325             /* If the \ is escaping white space when white space is being
17326              * skipped, it means that that white space is wanted literally, and
17327              * is already in 'value'.  Otherwise, need to translate the escape
17328              * into what it signifies. */
17329             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17330
17331             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17332             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17333             case 's':   namedclass = ANYOF_SPACE;       break;
17334             case 'S':   namedclass = ANYOF_NSPACE;      break;
17335             case 'd':   namedclass = ANYOF_DIGIT;       break;
17336             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17337             case 'v':   namedclass = ANYOF_VERTWS;      break;
17338             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17339             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17340             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17341             case 'N':  /* Handle \N{NAME} in class */
17342                 {
17343                     const char * const backslash_N_beg = RExC_parse - 2;
17344                     int cp_count;
17345
17346                     if (! grok_bslash_N(pRExC_state,
17347                                         NULL,      /* No regnode */
17348                                         &value,    /* Yes single value */
17349                                         &cp_count, /* Multiple code pt count */
17350                                         flagp,
17351                                         strict,
17352                                         depth)
17353                     ) {
17354
17355                         if (*flagp & NEED_UTF8)
17356                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17357
17358                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17359
17360                         if (cp_count < 0) {
17361                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17362                         }
17363                         else if (cp_count == 0) {
17364                             ckWARNreg(RExC_parse,
17365                               "Ignoring zero length \\N{} in character class");
17366                         }
17367                         else { /* cp_count > 1 */
17368                             assert(cp_count > 1);
17369                             if (! RExC_in_multi_char_class) {
17370                                 if ( ! allow_mutiple_chars
17371                                     || invert
17372                                     || range
17373                                     || *RExC_parse == '-')
17374                                 {
17375                                     if (strict) {
17376                                         RExC_parse--;
17377                                         vFAIL("\\N{} here is restricted to one character");
17378                                     }
17379                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17380                                     break; /* <value> contains the first code
17381                                               point. Drop out of the switch to
17382                                               process it */
17383                                 }
17384                                 else {
17385                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17386                                                  RExC_parse - backslash_N_beg);
17387                                     multi_char_matches
17388                                         = add_multi_match(multi_char_matches,
17389                                                           multi_char_N,
17390                                                           cp_count);
17391                                 }
17392                             }
17393                         } /* End of cp_count != 1 */
17394
17395                         /* This element should not be processed further in this
17396                          * class */
17397                         element_count--;
17398                         value = save_value;
17399                         prevvalue = save_prevvalue;
17400                         continue;   /* Back to top of loop to get next char */
17401                     }
17402
17403                     /* Here, is a single code point, and <value> contains it */
17404                     unicode_range = TRUE;   /* \N{} are Unicode */
17405                 }
17406                 break;
17407             case 'p':
17408             case 'P':
17409                 {
17410                 char *e;
17411
17412                 /* \p means they want Unicode semantics */
17413                 REQUIRE_UNI_RULES(flagp, 0);
17414
17415                 if (RExC_parse >= RExC_end)
17416                     vFAIL2("Empty \\%c", (U8)value);
17417                 if (*RExC_parse == '{') {
17418                     const U8 c = (U8)value;
17419                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17420                     if (!e) {
17421                         RExC_parse++;
17422                         vFAIL2("Missing right brace on \\%c{}", c);
17423                     }
17424
17425                     RExC_parse++;
17426
17427                     /* White space is allowed adjacent to the braces and after
17428                      * any '^', even when not under /x */
17429                     while (isSPACE(*RExC_parse)) {
17430                          RExC_parse++;
17431                     }
17432
17433                     if (UCHARAT(RExC_parse) == '^') {
17434
17435                         /* toggle.  (The rhs xor gets the single bit that
17436                          * differs between P and p; the other xor inverts just
17437                          * that bit) */
17438                         value ^= 'P' ^ 'p';
17439
17440                         RExC_parse++;
17441                         while (isSPACE(*RExC_parse)) {
17442                             RExC_parse++;
17443                         }
17444                     }
17445
17446                     if (e == RExC_parse)
17447                         vFAIL2("Empty \\%c{}", c);
17448
17449                     n = e - RExC_parse;
17450                     while (isSPACE(*(RExC_parse + n - 1)))
17451                         n--;
17452
17453                 }   /* The \p isn't immediately followed by a '{' */
17454                 else if (! isALPHA(*RExC_parse)) {
17455                     RExC_parse += (UTF)
17456                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17457                                   : 1;
17458                     vFAIL2("Character following \\%c must be '{' or a "
17459                            "single-character Unicode property name",
17460                            (U8) value);
17461                 }
17462                 else {
17463                     e = RExC_parse;
17464                     n = 1;
17465                 }
17466                 {
17467                     char* name = RExC_parse;
17468
17469                     /* Any message returned about expanding the definition */
17470                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17471
17472                     /* If set TRUE, the property is user-defined as opposed to
17473                      * official Unicode */
17474                     bool user_defined = FALSE;
17475
17476                     SV * prop_definition = parse_uniprop_string(
17477                                             name, n, UTF, FOLD,
17478                                             FALSE, /* This is compile-time */
17479
17480                                             /* We can't defer this defn when
17481                                              * the full result is required in
17482                                              * this call */
17483                                             ! cBOOL(ret_invlist),
17484
17485                                             &user_defined,
17486                                             msg,
17487                                             0 /* Base level */
17488                                            );
17489                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17490                         assert(prop_definition == NULL);
17491                         RExC_parse = e + 1;
17492                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17493                                                thing so, or else the display is
17494                                                mojibake */
17495                             RExC_utf8 = TRUE;
17496                         }
17497                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17498                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17499                                     SvCUR(msg), SvPVX(msg)));
17500                     }
17501
17502                     if (! is_invlist(prop_definition)) {
17503
17504                         /* Here, the definition isn't known, so what we got
17505                          * back is a string that will be evaluated if and when
17506                          * encountered at runtime.  We add it to the list of
17507                          * such properties, along with whether it should be
17508                          * complemented or not */
17509                         if (value == 'P') {
17510                             sv_catpvs(listsv, "!");
17511                         }
17512                         else {
17513                             sv_catpvs(listsv, "+");
17514                         }
17515                         sv_catsv(listsv, prop_definition);
17516
17517                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17518
17519                         /* We don't know yet what this matches, so have to flag
17520                          * it */
17521                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17522                     }
17523                     else {
17524                         assert (prop_definition && is_invlist(prop_definition));
17525
17526                         /* Here we do have the complete property definition
17527                          *
17528                          * Temporary workaround for [perl #133136].  For this
17529                          * precise input that is in the .t that is failing,
17530                          * load utf8.pm, which is what the test wants, so that
17531                          * that .t passes */
17532                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17533                                         "foo\\p{Alnum}")
17534                             && ! hv_common(GvHVn(PL_incgv),
17535                                            NULL,
17536                                            "utf8.pm", sizeof("utf8.pm") - 1,
17537                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17538                         {
17539                             require_pv("utf8.pm");
17540                         }
17541
17542                         if (! user_defined &&
17543                             /* We warn on matching an above-Unicode code point
17544                              * if the match would return true, except don't
17545                              * warn for \p{All}, which has exactly one element
17546                              * = 0 */
17547                             (_invlist_contains_cp(prop_definition, 0x110000)
17548                                 && (! (_invlist_len(prop_definition) == 1
17549                                        && *invlist_array(prop_definition) == 0))))
17550                         {
17551                             warn_super = TRUE;
17552                         }
17553
17554                         /* Invert if asking for the complement */
17555                         if (value == 'P') {
17556                             _invlist_union_complement_2nd(properties,
17557                                                           prop_definition,
17558                                                           &properties);
17559                         }
17560                         else {
17561                             _invlist_union(properties, prop_definition, &properties);
17562                         }
17563                     }
17564                 }
17565
17566                 RExC_parse = e + 1;
17567                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17568                                                 named */
17569                 }
17570                 break;
17571             case 'n':   value = '\n';                   break;
17572             case 'r':   value = '\r';                   break;
17573             case 't':   value = '\t';                   break;
17574             case 'f':   value = '\f';                   break;
17575             case 'b':   value = '\b';                   break;
17576             case 'e':   value = ESC_NATIVE;             break;
17577             case 'a':   value = '\a';                   break;
17578             case 'o':
17579                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17580                 {
17581                     const char* error_msg;
17582                     bool valid = grok_bslash_o(&RExC_parse,
17583                                                RExC_end,
17584                                                &value,
17585                                                &error_msg,
17586                                                TO_OUTPUT_WARNINGS(RExC_parse),
17587                                                strict,
17588                                                silence_non_portable,
17589                                                UTF);
17590                     if (! valid) {
17591                         vFAIL(error_msg);
17592                     }
17593                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17594                 }
17595                 non_portable_endpoint++;
17596                 break;
17597             case 'x':
17598                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17599                 {
17600                     const char* error_msg;
17601                     bool valid = grok_bslash_x(&RExC_parse,
17602                                                RExC_end,
17603                                                &value,
17604                                                &error_msg,
17605                                                TO_OUTPUT_WARNINGS(RExC_parse),
17606                                                strict,
17607                                                silence_non_portable,
17608                                                UTF);
17609                     if (! valid) {
17610                         vFAIL(error_msg);
17611                     }
17612                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17613                 }
17614                 non_portable_endpoint++;
17615                 break;
17616             case 'c':
17617                 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17618                 UPDATE_WARNINGS_LOC(RExC_parse);
17619                 RExC_parse++;
17620                 non_portable_endpoint++;
17621                 break;
17622             case '0': case '1': case '2': case '3': case '4':
17623             case '5': case '6': case '7':
17624                 {
17625                     /* Take 1-3 octal digits */
17626                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17627                     numlen = (strict) ? 4 : 3;
17628                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17629                     RExC_parse += numlen;
17630                     if (numlen != 3) {
17631                         if (strict) {
17632                             RExC_parse += (UTF)
17633                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17634                                           : 1;
17635                             vFAIL("Need exactly 3 octal digits");
17636                         }
17637                         else if (   numlen < 3 /* like \08, \178 */
17638                                  && RExC_parse < RExC_end
17639                                  && isDIGIT(*RExC_parse)
17640                                  && ckWARN(WARN_REGEXP))
17641                         {
17642                             reg_warn_non_literal_string(
17643                                  RExC_parse + 1,
17644                                  form_short_octal_warning(RExC_parse, numlen));
17645                         }
17646                     }
17647                     non_portable_endpoint++;
17648                     break;
17649                 }
17650             default:
17651                 /* Allow \_ to not give an error */
17652                 if (isWORDCHAR(value) && value != '_') {
17653                     if (strict) {
17654                         vFAIL2("Unrecognized escape \\%c in character class",
17655                                (int)value);
17656                     }
17657                     else {
17658                         ckWARN2reg(RExC_parse,
17659                             "Unrecognized escape \\%c in character class passed through",
17660                             (int)value);
17661                     }
17662                 }
17663                 break;
17664             }   /* End of switch on char following backslash */
17665         } /* end of handling backslash escape sequences */
17666
17667         /* Here, we have the current token in 'value' */
17668
17669         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17670             U8 classnum;
17671
17672             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17673              * literal, as is the character that began the false range, i.e.
17674              * the 'a' in the examples */
17675             if (range) {
17676                 const int w = (RExC_parse >= rangebegin)
17677                                 ? RExC_parse - rangebegin
17678                                 : 0;
17679                 if (strict) {
17680                     vFAIL2utf8f(
17681                         "False [] range \"%" UTF8f "\"",
17682                         UTF8fARG(UTF, w, rangebegin));
17683                 }
17684                 else {
17685                     ckWARN2reg(RExC_parse,
17686                         "False [] range \"%" UTF8f "\"",
17687                         UTF8fARG(UTF, w, rangebegin));
17688                     cp_list = add_cp_to_invlist(cp_list, '-');
17689                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17690                                                             prevvalue);
17691                 }
17692
17693                 range = 0; /* this was not a true range */
17694                 element_count += 2; /* So counts for three values */
17695             }
17696
17697             classnum = namedclass_to_classnum(namedclass);
17698
17699             if (LOC && namedclass < ANYOF_POSIXL_MAX
17700 #ifndef HAS_ISASCII
17701                 && classnum != _CC_ASCII
17702 #endif
17703             ) {
17704                 SV* scratch_list = NULL;
17705
17706                 /* What the Posix classes (like \w, [:space:]) match isn't
17707                  * generally knowable under locale until actual match time.  A
17708                  * special node is used for these which has extra space for a
17709                  * bitmap, with a bit reserved for each named class that is to
17710                  * be matched against.  (This isn't needed for \p{} and
17711                  * pseudo-classes, as they are not affected by locale, and
17712                  * hence are dealt with separately.)  However, if a named class
17713                  * and its complement are both present, then it matches
17714                  * everything, and there is no runtime dependency.  Odd numbers
17715                  * are the complements of the next lower number, so xor works.
17716                  * (Note that something like [\w\D] should match everything,
17717                  * because \d should be a proper subset of \w.  But rather than
17718                  * trust that the locale is well behaved, we leave this to
17719                  * runtime to sort out) */
17720                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17721                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17722                     POSIXL_ZERO(posixl);
17723                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17724                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17725                     continue;   /* We could ignore the rest of the class, but
17726                                    best to parse it for any errors */
17727                 }
17728                 else { /* Here, isn't the complement of any already parsed
17729                           class */
17730                     POSIXL_SET(posixl, namedclass);
17731                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17732                     anyof_flags |= ANYOF_MATCHES_POSIXL;
17733
17734                     /* The above-Latin1 characters are not subject to locale
17735                      * rules.  Just add them to the unconditionally-matched
17736                      * list */
17737
17738                     /* Get the list of the above-Latin1 code points this
17739                      * matches */
17740                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17741                                             PL_XPosix_ptrs[classnum],
17742
17743                                             /* Odd numbers are complements,
17744                                              * like NDIGIT, NASCII, ... */
17745                                             namedclass % 2 != 0,
17746                                             &scratch_list);
17747                     /* Checking if 'cp_list' is NULL first saves an extra
17748                      * clone.  Its reference count will be decremented at the
17749                      * next union, etc, or if this is the only instance, at the
17750                      * end of the routine */
17751                     if (! cp_list) {
17752                         cp_list = scratch_list;
17753                     }
17754                     else {
17755                         _invlist_union(cp_list, scratch_list, &cp_list);
17756                         SvREFCNT_dec_NN(scratch_list);
17757                     }
17758                     continue;   /* Go get next character */
17759                 }
17760             }
17761             else {
17762
17763                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17764                  * matter (or is a Unicode property, which is skipped here). */
17765                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17766                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17767
17768                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17769                          * nor /l make a difference in what these match,
17770                          * therefore we just add what they match to cp_list. */
17771                         if (classnum != _CC_VERTSPACE) {
17772                             assert(   namedclass == ANYOF_HORIZWS
17773                                    || namedclass == ANYOF_NHORIZWS);
17774
17775                             /* It turns out that \h is just a synonym for
17776                              * XPosixBlank */
17777                             classnum = _CC_BLANK;
17778                         }
17779
17780                         _invlist_union_maybe_complement_2nd(
17781                                 cp_list,
17782                                 PL_XPosix_ptrs[classnum],
17783                                 namedclass % 2 != 0,    /* Complement if odd
17784                                                           (NHORIZWS, NVERTWS)
17785                                                         */
17786                                 &cp_list);
17787                     }
17788                 }
17789                 else if (   AT_LEAST_UNI_SEMANTICS
17790                          || classnum == _CC_ASCII
17791                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17792                                                    || classnum == _CC_XDIGIT)))
17793                 {
17794                     /* We usually have to worry about /d affecting what POSIX
17795                      * classes match, with special code needed because we won't
17796                      * know until runtime what all matches.  But there is no
17797                      * extra work needed under /u and /a; and [:ascii:] is
17798                      * unaffected by /d; and :digit: and :xdigit: don't have
17799                      * runtime differences under /d.  So we can special case
17800                      * these, and avoid some extra work below, and at runtime.
17801                      * */
17802                     _invlist_union_maybe_complement_2nd(
17803                                                      simple_posixes,
17804                                                       ((AT_LEAST_ASCII_RESTRICTED)
17805                                                        ? PL_Posix_ptrs[classnum]
17806                                                        : PL_XPosix_ptrs[classnum]),
17807                                                      namedclass % 2 != 0,
17808                                                      &simple_posixes);
17809                 }
17810                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17811                            complement and use nposixes */
17812                     SV** posixes_ptr = namedclass % 2 == 0
17813                                        ? &posixes
17814                                        : &nposixes;
17815                     _invlist_union_maybe_complement_2nd(
17816                                                      *posixes_ptr,
17817                                                      PL_XPosix_ptrs[classnum],
17818                                                      namedclass % 2 != 0,
17819                                                      posixes_ptr);
17820                 }
17821             }
17822         } /* end of namedclass \blah */
17823
17824         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17825
17826         /* If 'range' is set, 'value' is the ending of a range--check its
17827          * validity.  (If value isn't a single code point in the case of a
17828          * range, we should have figured that out above in the code that
17829          * catches false ranges).  Later, we will handle each individual code
17830          * point in the range.  If 'range' isn't set, this could be the
17831          * beginning of a range, so check for that by looking ahead to see if
17832          * the next real character to be processed is the range indicator--the
17833          * minus sign */
17834
17835         if (range) {
17836 #ifdef EBCDIC
17837             /* For unicode ranges, we have to test that the Unicode as opposed
17838              * to the native values are not decreasing.  (Above 255, there is
17839              * no difference between native and Unicode) */
17840             if (unicode_range && prevvalue < 255 && value < 255) {
17841                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17842                     goto backwards_range;
17843                 }
17844             }
17845             else
17846 #endif
17847             if (prevvalue > value) /* b-a */ {
17848                 int w;
17849 #ifdef EBCDIC
17850               backwards_range:
17851 #endif
17852                 w = RExC_parse - rangebegin;
17853                 vFAIL2utf8f(
17854                     "Invalid [] range \"%" UTF8f "\"",
17855                     UTF8fARG(UTF, w, rangebegin));
17856                 NOT_REACHED; /* NOTREACHED */
17857             }
17858         }
17859         else {
17860             prevvalue = value; /* save the beginning of the potential range */
17861             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17862                 && *RExC_parse == '-')
17863             {
17864                 char* next_char_ptr = RExC_parse + 1;
17865
17866                 /* Get the next real char after the '-' */
17867                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17868
17869                 /* If the '-' is at the end of the class (just before the ']'),
17870                  * it is a literal minus; otherwise it is a range */
17871                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17872                     RExC_parse = next_char_ptr;
17873
17874                     /* a bad range like \w-, [:word:]- ? */
17875                     if (namedclass > OOB_NAMEDCLASS) {
17876                         if (strict || ckWARN(WARN_REGEXP)) {
17877                             const int w = RExC_parse >= rangebegin
17878                                           ?  RExC_parse - rangebegin
17879                                           : 0;
17880                             if (strict) {
17881                                 vFAIL4("False [] range \"%*.*s\"",
17882                                     w, w, rangebegin);
17883                             }
17884                             else {
17885                                 vWARN4(RExC_parse,
17886                                     "False [] range \"%*.*s\"",
17887                                     w, w, rangebegin);
17888                             }
17889                         }
17890                         cp_list = add_cp_to_invlist(cp_list, '-');
17891                         element_count++;
17892                     } else
17893                         range = 1;      /* yeah, it's a range! */
17894                     continue;   /* but do it the next time */
17895                 }
17896             }
17897         }
17898
17899         if (namedclass > OOB_NAMEDCLASS) {
17900             continue;
17901         }
17902
17903         /* Here, we have a single value this time through the loop, and
17904          * <prevvalue> is the beginning of the range, if any; or <value> if
17905          * not. */
17906
17907         /* non-Latin1 code point implies unicode semantics. */
17908         if (value > 255) {
17909             REQUIRE_UNI_RULES(flagp, 0);
17910         }
17911
17912         /* Ready to process either the single value, or the completed range.
17913          * For single-valued non-inverted ranges, we consider the possibility
17914          * of multi-char folds.  (We made a conscious decision to not do this
17915          * for the other cases because it can often lead to non-intuitive
17916          * results.  For example, you have the peculiar case that:
17917          *  "s s" =~ /^[^\xDF]+$/i => Y
17918          *  "ss"  =~ /^[^\xDF]+$/i => N
17919          *
17920          * See [perl #89750] */
17921         if (FOLD && allow_mutiple_chars && value == prevvalue) {
17922             if (    value == LATIN_SMALL_LETTER_SHARP_S
17923                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17924                                                         value)))
17925             {
17926                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17927
17928                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17929                 STRLEN foldlen;
17930
17931                 UV folded = _to_uni_fold_flags(
17932                                 value,
17933                                 foldbuf,
17934                                 &foldlen,
17935                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17936                                                    ? FOLD_FLAGS_NOMIX_ASCII
17937                                                    : 0)
17938                                 );
17939
17940                 /* Here, <folded> should be the first character of the
17941                  * multi-char fold of <value>, with <foldbuf> containing the
17942                  * whole thing.  But, if this fold is not allowed (because of
17943                  * the flags), <folded> will be the same as <value>, and should
17944                  * be processed like any other character, so skip the special
17945                  * handling */
17946                 if (folded != value) {
17947
17948                     /* Skip if we are recursed, currently parsing the class
17949                      * again.  Otherwise add this character to the list of
17950                      * multi-char folds. */
17951                     if (! RExC_in_multi_char_class) {
17952                         STRLEN cp_count = utf8_length(foldbuf,
17953                                                       foldbuf + foldlen);
17954                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17955
17956                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17957
17958                         multi_char_matches
17959                                         = add_multi_match(multi_char_matches,
17960                                                           multi_fold,
17961                                                           cp_count);
17962
17963                     }
17964
17965                     /* This element should not be processed further in this
17966                      * class */
17967                     element_count--;
17968                     value = save_value;
17969                     prevvalue = save_prevvalue;
17970                     continue;
17971                 }
17972             }
17973         }
17974
17975         if (strict && ckWARN(WARN_REGEXP)) {
17976             if (range) {
17977
17978                 /* If the range starts above 255, everything is portable and
17979                  * likely to be so for any foreseeable character set, so don't
17980                  * warn. */
17981                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17982                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17983                 }
17984                 else if (prevvalue != value) {
17985
17986                     /* Under strict, ranges that start and/or end in an ASCII
17987                      * printable should have each end point be a portable value
17988                      * for it (preferably like 'A', but we don't warn if it is
17989                      * a (portable) Unicode name or code point), and the range
17990                      * must be all digits or all letters of the same case.
17991                      * Otherwise, the range is non-portable and unclear as to
17992                      * what it contains */
17993                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17994                         && (          non_portable_endpoint
17995                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17996                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17997                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17998                     ))) {
17999                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18000                                           " be some subset of \"0-9\","
18001                                           " \"A-Z\", or \"a-z\"");
18002                     }
18003                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18004                         SSize_t index_start;
18005                         SSize_t index_final;
18006
18007                         /* But the nature of Unicode and languages mean we
18008                          * can't do the same checks for above-ASCII ranges,
18009                          * except in the case of digit ones.  These should
18010                          * contain only digits from the same group of 10.  The
18011                          * ASCII case is handled just above.  Hence here, the
18012                          * range could be a range of digits.  First some
18013                          * unlikely special cases.  Grandfather in that a range
18014                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18015                          * if its starting value is one of the 10 digits prior
18016                          * to it.  This is because it is an alternate way of
18017                          * writing 19D1, and some people may expect it to be in
18018                          * that group.  But it is bad, because it won't give
18019                          * the expected results.  In Unicode 5.2 it was
18020                          * considered to be in that group (of 11, hence), but
18021                          * this was fixed in the next version */
18022
18023                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18024                             goto warn_bad_digit_range;
18025                         }
18026                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18027                                           &&     value <= 0x1D7FF))
18028                         {
18029                             /* This is the only other case currently in Unicode
18030                              * where the algorithm below fails.  The code
18031                              * points just above are the end points of a single
18032                              * range containing only decimal digits.  It is 5
18033                              * different series of 0-9.  All other ranges of
18034                              * digits currently in Unicode are just a single
18035                              * series.  (And mktables will notify us if a later
18036                              * Unicode version breaks this.)
18037                              *
18038                              * If the range being checked is at most 9 long,
18039                              * and the digit values represented are in
18040                              * numerical order, they are from the same series.
18041                              * */
18042                             if (         value - prevvalue > 9
18043                                 ||    (((    value - 0x1D7CE) % 10)
18044                                      <= (prevvalue - 0x1D7CE) % 10))
18045                             {
18046                                 goto warn_bad_digit_range;
18047                             }
18048                         }
18049                         else {
18050
18051                             /* For all other ranges of digits in Unicode, the
18052                              * algorithm is just to check if both end points
18053                              * are in the same series, which is the same range.
18054                              * */
18055                             index_start = _invlist_search(
18056                                                     PL_XPosix_ptrs[_CC_DIGIT],
18057                                                     prevvalue);
18058
18059                             /* Warn if the range starts and ends with a digit,
18060                              * and they are not in the same group of 10. */
18061                             if (   index_start >= 0
18062                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18063                                 && (index_final =
18064                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18065                                                     value)) != index_start
18066                                 && index_final >= 0
18067                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18068                             {
18069                               warn_bad_digit_range:
18070                                 vWARN(RExC_parse, "Ranges of digits should be"
18071                                                   " from the same group of"
18072                                                   " 10");
18073                             }
18074                         }
18075                     }
18076                 }
18077             }
18078             if ((! range || prevvalue == value) && non_portable_endpoint) {
18079                 if (isPRINT_A(value)) {
18080                     char literal[3];
18081                     unsigned d = 0;
18082                     if (isBACKSLASHED_PUNCT(value)) {
18083                         literal[d++] = '\\';
18084                     }
18085                     literal[d++] = (char) value;
18086                     literal[d++] = '\0';
18087
18088                     vWARN4(RExC_parse,
18089                            "\"%.*s\" is more clearly written simply as \"%s\"",
18090                            (int) (RExC_parse - rangebegin),
18091                            rangebegin,
18092                            literal
18093                         );
18094                 }
18095                 else if (isMNEMONIC_CNTRL(value)) {
18096                     vWARN4(RExC_parse,
18097                            "\"%.*s\" is more clearly written simply as \"%s\"",
18098                            (int) (RExC_parse - rangebegin),
18099                            rangebegin,
18100                            cntrl_to_mnemonic((U8) value)
18101                         );
18102                 }
18103             }
18104         }
18105
18106         /* Deal with this element of the class */
18107
18108 #ifndef EBCDIC
18109         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18110                                                     prevvalue, value);
18111 #else
18112         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18113          * that don't require special handling, we can just add the range like
18114          * we do for ASCII platforms */
18115         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18116             || ! (prevvalue < 256
18117                     && (unicode_range
18118                         || (! non_portable_endpoint
18119                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18120                                 || (isUPPER_A(prevvalue)
18121                                     && isUPPER_A(value)))))))
18122         {
18123             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18124                                                         prevvalue, value);
18125         }
18126         else {
18127             /* Here, requires special handling.  This can be because it is a
18128              * range whose code points are considered to be Unicode, and so
18129              * must be individually translated into native, or because it's a
18130              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18131              * EBCDIC, but we have defined them to include only the "expected"
18132              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18133              * the same in native and Unicode, so can be added as a range */
18134             U8 start = NATIVE_TO_LATIN1(prevvalue);
18135             unsigned j;
18136             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18137             for (j = start; j <= end; j++) {
18138                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18139             }
18140             if (value > 255) {
18141                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18142                                                             256, value);
18143             }
18144         }
18145 #endif
18146
18147         range = 0; /* this range (if it was one) is done now */
18148     } /* End of loop through all the text within the brackets */
18149
18150     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18151         output_posix_warnings(pRExC_state, posix_warnings);
18152     }
18153
18154     /* If anything in the class expands to more than one character, we have to
18155      * deal with them by building up a substitute parse string, and recursively
18156      * calling reg() on it, instead of proceeding */
18157     if (multi_char_matches) {
18158         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18159         I32 cp_count;
18160         STRLEN len;
18161         char *save_end = RExC_end;
18162         char *save_parse = RExC_parse;
18163         char *save_start = RExC_start;
18164         Size_t constructed_prefix_len = 0; /* This gives the length of the
18165                                               constructed portion of the
18166                                               substitute parse. */
18167         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18168                                        a "|" */
18169         I32 reg_flags;
18170
18171         assert(! invert);
18172         /* Only one level of recursion allowed */
18173         assert(RExC_copy_start_in_constructed == RExC_precomp);
18174
18175 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18176            because too confusing */
18177         if (invert) {
18178             sv_catpvs(substitute_parse, "(?:");
18179         }
18180 #endif
18181
18182         /* Look at the longest folds first */
18183         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18184                         cp_count > 0;
18185                         cp_count--)
18186         {
18187
18188             if (av_exists(multi_char_matches, cp_count)) {
18189                 AV** this_array_ptr;
18190                 SV* this_sequence;
18191
18192                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18193                                                  cp_count, FALSE);
18194                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18195                                                                 &PL_sv_undef)
18196                 {
18197                     if (! first_time) {
18198                         sv_catpvs(substitute_parse, "|");
18199                     }
18200                     first_time = FALSE;
18201
18202                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18203                 }
18204             }
18205         }
18206
18207         /* If the character class contains anything else besides these
18208          * multi-character folds, have to include it in recursive parsing */
18209         if (element_count) {
18210             sv_catpvs(substitute_parse, "|[");
18211             constructed_prefix_len = SvCUR(substitute_parse);
18212             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18213
18214             /* Put in a closing ']' only if not going off the end, as otherwise
18215              * we are adding something that really isn't there */
18216             if (RExC_parse < RExC_end) {
18217                 sv_catpvs(substitute_parse, "]");
18218             }
18219         }
18220
18221         sv_catpvs(substitute_parse, ")");
18222 #if 0
18223         if (invert) {
18224             /* This is a way to get the parse to skip forward a whole named
18225              * sequence instead of matching the 2nd character when it fails the
18226              * first */
18227             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18228         }
18229 #endif
18230
18231         /* Set up the data structure so that any errors will be properly
18232          * reported.  See the comments at the definition of
18233          * REPORT_LOCATION_ARGS for details */
18234         RExC_copy_start_in_input = (char *) orig_parse;
18235         RExC_start = RExC_parse = SvPV(substitute_parse, len);
18236         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18237         RExC_end = RExC_parse + len;
18238         RExC_in_multi_char_class = 1;
18239
18240         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18241
18242         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18243
18244         /* And restore so can parse the rest of the pattern */
18245         RExC_parse = save_parse;
18246         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18247         RExC_end = save_end;
18248         RExC_in_multi_char_class = 0;
18249         SvREFCNT_dec_NN(multi_char_matches);
18250         return ret;
18251     }
18252
18253     /* If folding, we calculate all characters that could fold to or from the
18254      * ones already on the list */
18255     if (cp_foldable_list) {
18256         if (FOLD) {
18257             UV start, end;      /* End points of code point ranges */
18258
18259             SV* fold_intersection = NULL;
18260             SV** use_list;
18261
18262             /* Our calculated list will be for Unicode rules.  For locale
18263              * matching, we have to keep a separate list that is consulted at
18264              * runtime only when the locale indicates Unicode rules (and we
18265              * don't include potential matches in the ASCII/Latin1 range, as
18266              * any code point could fold to any other, based on the run-time
18267              * locale).   For non-locale, we just use the general list */
18268             if (LOC) {
18269                 use_list = &only_utf8_locale_list;
18270             }
18271             else {
18272                 use_list = &cp_list;
18273             }
18274
18275             /* Only the characters in this class that participate in folds need
18276              * be checked.  Get the intersection of this class and all the
18277              * possible characters that are foldable.  This can quickly narrow
18278              * down a large class */
18279             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18280                                   &fold_intersection);
18281
18282             /* Now look at the foldable characters in this class individually */
18283             invlist_iterinit(fold_intersection);
18284             while (invlist_iternext(fold_intersection, &start, &end)) {
18285                 UV j;
18286                 UV folded;
18287
18288                 /* Look at every character in the range */
18289                 for (j = start; j <= end; j++) {
18290                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18291                     STRLEN foldlen;
18292                     unsigned int k;
18293                     Size_t folds_count;
18294                     unsigned int first_fold;
18295                     const unsigned int * remaining_folds;
18296
18297                     if (j < 256) {
18298
18299                         /* Under /l, we don't know what code points below 256
18300                          * fold to, except we do know the MICRO SIGN folds to
18301                          * an above-255 character if the locale is UTF-8, so we
18302                          * add it to the special list (in *use_list).  Otherwise
18303                          * we know now what things can match, though some folds
18304                          * are valid under /d only if the target is UTF-8.
18305                          * Those go in a separate list */
18306                         if (      IS_IN_SOME_FOLD_L1(j)
18307                             && ! (LOC && j != MICRO_SIGN))
18308                         {
18309
18310                             /* ASCII is always matched; non-ASCII is matched
18311                              * only under Unicode rules (which could happen
18312                              * under /l if the locale is a UTF-8 one) */
18313                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18314                                 *use_list = add_cp_to_invlist(*use_list,
18315                                                             PL_fold_latin1[j]);
18316                             }
18317                             else if (j != PL_fold_latin1[j]) {
18318                                 upper_latin1_only_utf8_matches
18319                                         = add_cp_to_invlist(
18320                                                 upper_latin1_only_utf8_matches,
18321                                                 PL_fold_latin1[j]);
18322                             }
18323                         }
18324
18325                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18326                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18327                         {
18328                             add_above_Latin1_folds(pRExC_state,
18329                                                    (U8) j,
18330                                                    use_list);
18331                         }
18332                         continue;
18333                     }
18334
18335                     /* Here is an above Latin1 character.  We don't have the
18336                      * rules hard-coded for it.  First, get its fold.  This is
18337                      * the simple fold, as the multi-character folds have been
18338                      * handled earlier and separated out */
18339                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18340                                                         (ASCII_FOLD_RESTRICTED)
18341                                                         ? FOLD_FLAGS_NOMIX_ASCII
18342                                                         : 0);
18343
18344                     /* Single character fold of above Latin1.  Add everything
18345                      * in its fold closure to the list that this node should
18346                      * match. */
18347                     folds_count = _inverse_folds(folded, &first_fold,
18348                                                     &remaining_folds);
18349                     for (k = 0; k <= folds_count; k++) {
18350                         UV c = (k == 0)     /* First time through use itself */
18351                                 ? folded
18352                                 : (k == 1)  /* 2nd time use, the first fold */
18353                                    ? first_fold
18354
18355                                      /* Then the remaining ones */
18356                                    : remaining_folds[k-2];
18357
18358                         /* /aa doesn't allow folds between ASCII and non- */
18359                         if ((   ASCII_FOLD_RESTRICTED
18360                             && (isASCII(c) != isASCII(j))))
18361                         {
18362                             continue;
18363                         }
18364
18365                         /* Folds under /l which cross the 255/256 boundary are
18366                          * added to a separate list.  (These are valid only
18367                          * when the locale is UTF-8.) */
18368                         if (c < 256 && LOC) {
18369                             *use_list = add_cp_to_invlist(*use_list, c);
18370                             continue;
18371                         }
18372
18373                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18374                         {
18375                             cp_list = add_cp_to_invlist(cp_list, c);
18376                         }
18377                         else {
18378                             /* Similarly folds involving non-ascii Latin1
18379                              * characters under /d are added to their list */
18380                             upper_latin1_only_utf8_matches
18381                                     = add_cp_to_invlist(
18382                                                 upper_latin1_only_utf8_matches,
18383                                                 c);
18384                         }
18385                     }
18386                 }
18387             }
18388             SvREFCNT_dec_NN(fold_intersection);
18389         }
18390
18391         /* Now that we have finished adding all the folds, there is no reason
18392          * to keep the foldable list separate */
18393         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18394         SvREFCNT_dec_NN(cp_foldable_list);
18395     }
18396
18397     /* And combine the result (if any) with any inversion lists from posix
18398      * classes.  The lists are kept separate up to now because we don't want to
18399      * fold the classes */
18400     if (simple_posixes) {   /* These are the classes known to be unaffected by
18401                                /a, /aa, and /d */
18402         if (cp_list) {
18403             _invlist_union(cp_list, simple_posixes, &cp_list);
18404             SvREFCNT_dec_NN(simple_posixes);
18405         }
18406         else {
18407             cp_list = simple_posixes;
18408         }
18409     }
18410     if (posixes || nposixes) {
18411         if (! DEPENDS_SEMANTICS) {
18412
18413             /* For everything but /d, we can just add the current 'posixes' and
18414              * 'nposixes' to the main list */
18415             if (posixes) {
18416                 if (cp_list) {
18417                     _invlist_union(cp_list, posixes, &cp_list);
18418                     SvREFCNT_dec_NN(posixes);
18419                 }
18420                 else {
18421                     cp_list = posixes;
18422                 }
18423             }
18424             if (nposixes) {
18425                 if (cp_list) {
18426                     _invlist_union(cp_list, nposixes, &cp_list);
18427                     SvREFCNT_dec_NN(nposixes);
18428                 }
18429                 else {
18430                     cp_list = nposixes;
18431                 }
18432             }
18433         }
18434         else {
18435             /* Under /d, things like \w match upper Latin1 characters only if
18436              * the target string is in UTF-8.  But things like \W match all the
18437              * upper Latin1 characters if the target string is not in UTF-8.
18438              *
18439              * Handle the case with something like \W separately */
18440             if (nposixes) {
18441                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18442
18443                 /* A complemented posix class matches all upper Latin1
18444                  * characters if not in UTF-8.  And it matches just certain
18445                  * ones when in UTF-8.  That means those certain ones are
18446                  * matched regardless, so can just be added to the
18447                  * unconditional list */
18448                 if (cp_list) {
18449                     _invlist_union(cp_list, nposixes, &cp_list);
18450                     SvREFCNT_dec_NN(nposixes);
18451                     nposixes = NULL;
18452                 }
18453                 else {
18454                     cp_list = nposixes;
18455                 }
18456
18457                 /* Likewise for 'posixes' */
18458                 _invlist_union(posixes, cp_list, &cp_list);
18459                 SvREFCNT_dec(posixes);
18460
18461                 /* Likewise for anything else in the range that matched only
18462                  * under UTF-8 */
18463                 if (upper_latin1_only_utf8_matches) {
18464                     _invlist_union(cp_list,
18465                                    upper_latin1_only_utf8_matches,
18466                                    &cp_list);
18467                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18468                     upper_latin1_only_utf8_matches = NULL;
18469                 }
18470
18471                 /* If we don't match all the upper Latin1 characters regardless
18472                  * of UTF-8ness, we have to set a flag to match the rest when
18473                  * not in UTF-8 */
18474                 _invlist_subtract(only_non_utf8_list, cp_list,
18475                                   &only_non_utf8_list);
18476                 if (_invlist_len(only_non_utf8_list) != 0) {
18477                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18478                 }
18479                 SvREFCNT_dec_NN(only_non_utf8_list);
18480             }
18481             else {
18482                 /* Here there were no complemented posix classes.  That means
18483                  * the upper Latin1 characters in 'posixes' match only when the
18484                  * target string is in UTF-8.  So we have to add them to the
18485                  * list of those types of code points, while adding the
18486                  * remainder to the unconditional list.
18487                  *
18488                  * First calculate what they are */
18489                 SV* nonascii_but_latin1_properties = NULL;
18490                 _invlist_intersection(posixes, PL_UpperLatin1,
18491                                       &nonascii_but_latin1_properties);
18492
18493                 /* And add them to the final list of such characters. */
18494                 _invlist_union(upper_latin1_only_utf8_matches,
18495                                nonascii_but_latin1_properties,
18496                                &upper_latin1_only_utf8_matches);
18497
18498                 /* Remove them from what now becomes the unconditional list */
18499                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18500                                   &posixes);
18501
18502                 /* And add those unconditional ones to the final list */
18503                 if (cp_list) {
18504                     _invlist_union(cp_list, posixes, &cp_list);
18505                     SvREFCNT_dec_NN(posixes);
18506                     posixes = NULL;
18507                 }
18508                 else {
18509                     cp_list = posixes;
18510                 }
18511
18512                 SvREFCNT_dec(nonascii_but_latin1_properties);
18513
18514                 /* Get rid of any characters from the conditional list that we
18515                  * now know are matched unconditionally, which may make that
18516                  * list empty */
18517                 _invlist_subtract(upper_latin1_only_utf8_matches,
18518                                   cp_list,
18519                                   &upper_latin1_only_utf8_matches);
18520                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18521                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18522                     upper_latin1_only_utf8_matches = NULL;
18523                 }
18524             }
18525         }
18526     }
18527
18528     /* And combine the result (if any) with any inversion list from properties.
18529      * The lists are kept separate up to now so that we can distinguish the two
18530      * in regards to matching above-Unicode.  A run-time warning is generated
18531      * if a Unicode property is matched against a non-Unicode code point. But,
18532      * we allow user-defined properties to match anything, without any warning,
18533      * and we also suppress the warning if there is a portion of the character
18534      * class that isn't a Unicode property, and which matches above Unicode, \W
18535      * or [\x{110000}] for example.
18536      * (Note that in this case, unlike the Posix one above, there is no
18537      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18538      * forces Unicode semantics.) */
18539     if (properties) {
18540         if (cp_list) {
18541
18542             /* If it matters to the final outcome, see if a non-property
18543              * component of the class matches above Unicode.  If so, the
18544              * warning gets suppressed.  This is true even if just a single
18545              * such code point is specified, as, though not strictly correct if
18546              * another such code point is matched against, the fact that they
18547              * are using above-Unicode code points indicates they should know
18548              * the issues involved */
18549             if (warn_super) {
18550                 warn_super = ! (invert
18551                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18552             }
18553
18554             _invlist_union(properties, cp_list, &cp_list);
18555             SvREFCNT_dec_NN(properties);
18556         }
18557         else {
18558             cp_list = properties;
18559         }
18560
18561         if (warn_super) {
18562             anyof_flags
18563              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18564
18565             /* Because an ANYOF node is the only one that warns, this node
18566              * can't be optimized into something else */
18567             optimizable = FALSE;
18568         }
18569     }
18570
18571     /* Here, we have calculated what code points should be in the character
18572      * class.
18573      *
18574      * Now we can see about various optimizations.  Fold calculation (which we
18575      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18576      * would invert to include K, which under /i would match k, which it
18577      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18578      * folded until runtime */
18579
18580     /* If we didn't do folding, it's because some information isn't available
18581      * until runtime; set the run-time fold flag for these.  We know to set the
18582      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18583      * at least one 0-255 range code point */
18584     if (LOC && FOLD) {
18585
18586         /* Some things on the list might be unconditionally included because of
18587          * other components.  Remove them, and clean up the list if it goes to
18588          * 0 elements */
18589         if (only_utf8_locale_list && cp_list) {
18590             _invlist_subtract(only_utf8_locale_list, cp_list,
18591                               &only_utf8_locale_list);
18592
18593             if (_invlist_len(only_utf8_locale_list) == 0) {
18594                 SvREFCNT_dec_NN(only_utf8_locale_list);
18595                 only_utf8_locale_list = NULL;
18596             }
18597         }
18598         if (    only_utf8_locale_list
18599             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18600                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18601         {
18602             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18603             anyof_flags
18604                  |= ANYOFL_FOLD
18605                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18606         }
18607         else if (cp_list && invlist_lowest(cp_list) < 256) {
18608             /* If nothing is below 256, has no locale dependency; otherwise it
18609              * does */
18610             anyof_flags |= ANYOFL_FOLD;
18611             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18612         }
18613     }
18614     else if (   DEPENDS_SEMANTICS
18615              && (    upper_latin1_only_utf8_matches
18616                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18617     {
18618         RExC_seen_d_op = TRUE;
18619         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18620     }
18621
18622     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18623      * compile time. */
18624     if (     cp_list
18625         &&   invert
18626         && ! has_runtime_dependency)
18627     {
18628         _invlist_invert(cp_list);
18629
18630         /* Clear the invert flag since have just done it here */
18631         invert = FALSE;
18632     }
18633
18634     if (ret_invlist) {
18635         *ret_invlist = cp_list;
18636
18637         return RExC_emit;
18638     }
18639
18640     /* All possible optimizations below still have these characteristics.
18641      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18642      * routine) */
18643     *flagp |= HASWIDTH|SIMPLE;
18644
18645     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18646         RExC_contains_locale = 1;
18647     }
18648
18649     /* Some character classes are equivalent to other nodes.  Such nodes take
18650      * up less room, and some nodes require fewer operations to execute, than
18651      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18652      * improve efficiency. */
18653
18654     if (optimizable) {
18655         PERL_UINT_FAST8_T i;
18656         UV partial_cp_count = 0;
18657         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18658         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18659         bool single_range = FALSE;
18660
18661         if (cp_list) { /* Count the code points in enough ranges that we would
18662                           see all the ones possible in any fold in this version
18663                           of Unicode */
18664
18665             invlist_iterinit(cp_list);
18666             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18667                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18668                     break;
18669                 }
18670                 partial_cp_count += end[i] - start[i] + 1;
18671             }
18672
18673             if (i == 1) {
18674                 single_range = TRUE;
18675             }
18676             invlist_iterfinish(cp_list);
18677         }
18678
18679         /* If we know at compile time that this matches every possible code
18680          * point, any run-time dependencies don't matter */
18681         if (start[0] == 0 && end[0] == UV_MAX) {
18682             if (invert) {
18683                 ret = reganode(pRExC_state, OPFAIL, 0);
18684             }
18685             else {
18686                 ret = reg_node(pRExC_state, SANY);
18687                 MARK_NAUGHTY(1);
18688             }
18689             goto not_anyof;
18690         }
18691
18692         /* Similarly, for /l posix classes, if both a class and its
18693          * complement match, any run-time dependencies don't matter */
18694         if (posixl) {
18695             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18696                                                         namedclass += 2)
18697             {
18698                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18699                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18700                 {
18701                     if (invert) {
18702                         ret = reganode(pRExC_state, OPFAIL, 0);
18703                     }
18704                     else {
18705                         ret = reg_node(pRExC_state, SANY);
18706                         MARK_NAUGHTY(1);
18707                     }
18708                     goto not_anyof;
18709                 }
18710             }
18711
18712             /* For well-behaved locales, some classes are subsets of others,
18713              * so complementing the subset and including the non-complemented
18714              * superset should match everything, like [\D[:alnum:]], and
18715              * [[:^alpha:][:alnum:]], but some implementations of locales are
18716              * buggy, and khw thinks it's a bad idea to have optimization change
18717              * behavior, even if it avoids an OS bug in a given case */
18718
18719 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18720
18721             /* If is a single posix /l class, can optimize to just that op.
18722              * Such a node will not match anything in the Latin1 range, as that
18723              * is not determinable until runtime, but will match whatever the
18724              * class does outside that range.  (Note that some classes won't
18725              * match anything outside the range, like [:ascii:]) */
18726             if (    isSINGLE_BIT_SET(posixl)
18727                 && (partial_cp_count == 0 || start[0] > 255))
18728             {
18729                 U8 classnum;
18730                 SV * class_above_latin1 = NULL;
18731                 bool already_inverted;
18732                 bool are_equivalent;
18733
18734                 /* Compute which bit is set, which is the same thing as, e.g.,
18735                  * ANYOF_CNTRL.  From
18736                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18737                  * */
18738                 static const int MultiplyDeBruijnBitPosition2[32] =
18739                     {
18740                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18741                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18742                     };
18743
18744                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18745                                                           * 0x077CB531U) >> 27];
18746                 classnum = namedclass_to_classnum(namedclass);
18747
18748                 /* The named classes are such that the inverted number is one
18749                  * larger than the non-inverted one */
18750                 already_inverted = namedclass
18751                                  - classnum_to_namedclass(classnum);
18752
18753                 /* Create an inversion list of the official property, inverted
18754                  * if the constructed node list is inverted, and restricted to
18755                  * only the above latin1 code points, which are the only ones
18756                  * known at compile time */
18757                 _invlist_intersection_maybe_complement_2nd(
18758                                                     PL_AboveLatin1,
18759                                                     PL_XPosix_ptrs[classnum],
18760                                                     already_inverted,
18761                                                     &class_above_latin1);
18762                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18763                                                                         FALSE);
18764                 SvREFCNT_dec_NN(class_above_latin1);
18765
18766                 if (are_equivalent) {
18767
18768                     /* Resolve the run-time inversion flag with this possibly
18769                      * inverted class */
18770                     invert = invert ^ already_inverted;
18771
18772                     ret = reg_node(pRExC_state,
18773                                    POSIXL + invert * (NPOSIXL - POSIXL));
18774                     FLAGS(REGNODE_p(ret)) = classnum;
18775                     goto not_anyof;
18776                 }
18777             }
18778         }
18779
18780         /* khw can't think of any other possible transformation involving
18781          * these. */
18782         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18783             goto is_anyof;
18784         }
18785
18786         if (! has_runtime_dependency) {
18787
18788             /* If the list is empty, nothing matches.  This happens, for
18789              * example, when a Unicode property that doesn't match anything is
18790              * the only element in the character class (perluniprops.pod notes
18791              * such properties). */
18792             if (partial_cp_count == 0) {
18793                 if (invert) {
18794                     ret = reg_node(pRExC_state, SANY);
18795                 }
18796                 else {
18797                     ret = reganode(pRExC_state, OPFAIL, 0);
18798                 }
18799
18800                 goto not_anyof;
18801             }
18802
18803             /* If matches everything but \n */
18804             if (   start[0] == 0 && end[0] == '\n' - 1
18805                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18806             {
18807                 assert (! invert);
18808                 ret = reg_node(pRExC_state, REG_ANY);
18809                 MARK_NAUGHTY(1);
18810                 goto not_anyof;
18811             }
18812         }
18813
18814         /* Next see if can optimize classes that contain just a few code points
18815          * into an EXACTish node.  The reason to do this is to let the
18816          * optimizer join this node with adjacent EXACTish ones, and ANYOF
18817          * nodes require conversion to code point from UTF-8.
18818          *
18819          * An EXACTFish node can be generated even if not under /i, and vice
18820          * versa.  But care must be taken.  An EXACTFish node has to be such
18821          * that it only matches precisely the code points in the class, but we
18822          * want to generate the least restrictive one that does that, to
18823          * increase the odds of being able to join with an adjacent node.  For
18824          * example, if the class contains [kK], we have to make it an EXACTFAA
18825          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18826          * /i or not is irrelevant in this case.  Less obvious is the pattern
18827          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18828          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18829          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18830          * that includes \x{02BC}, there is a multi-char fold that does, and so
18831          * the node generated for it must be an EXACTFish one.  On the other
18832          * hand qr/:/i should generate a plain EXACT node since the colon
18833          * participates in no fold whatsoever, and having it EXACT tells the
18834          * optimizer the target string cannot match unless it has a colon in
18835          * it.
18836          */
18837         if (   ! posixl
18838             && ! invert
18839
18840                 /* Only try if there are no more code points in the class than
18841                  * in the max possible fold */
18842             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
18843         {
18844             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18845             {
18846                 /* We can always make a single code point class into an
18847                  * EXACTish node. */
18848
18849                 if (LOC) {
18850
18851                     /* Here is /l:  Use EXACTL, except if there is a fold not
18852                      * known until runtime so shows as only a single code point
18853                      * here.  For code points above 255, we know which can
18854                      * cause problems by having a potential fold to the Latin1
18855                      * range. */
18856                     if (  ! FOLD
18857                         || (     start[0] > 255
18858                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
18859                     {
18860                         op = EXACTL;
18861                     }
18862                     else {
18863                         op = EXACTFL;
18864                     }
18865                 }
18866                 else if (! FOLD) { /* Not /l and not /i */
18867                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
18868                 }
18869                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18870                                               small */
18871
18872                     /* Under /i, it gets a little tricky.  A code point that
18873                      * doesn't participate in a fold should be an EXACT node.
18874                      * We know this one isn't the result of a simple fold, or
18875                      * there'd be more than one code point in the list, but it
18876                      * could be part of a multi-character fold.  In that case
18877                      * we better not create an EXACT node, as we would wrongly
18878                      * be telling the optimizer that this code point must be in
18879                      * the target string, and that is wrong.  This is because
18880                      * if the sequence around this code point forms a
18881                      * multi-char fold, what needs to be in the string could be
18882                      * the code point that folds to the sequence.
18883                      *
18884                      * This handles the case of code points below 256, as we
18885                      * have an easy look up for those.  The next clause handles
18886                      * the ones that are 256 and above */
18887                     op = IS_IN_SOME_FOLD_L1(start[0])
18888                          ? EXACTFU
18889                          : EXACT;
18890                 }
18891                 else {  /* /i, larger code point.  Since we are under /i, and
18892                            have just this code point, we know that it can't
18893                            fold to something else, so PL_InMultiCharFold
18894                            applies to it */
18895                     op = _invlist_contains_cp(PL_InMultiCharFold,
18896                                               start[0])
18897                          ? EXACTFU_REQ8
18898                          : EXACT_REQ8;
18899                 }
18900
18901                 value = start[0];
18902             }
18903             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18904                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18905             {
18906                 /* Here, the only runtime dependency, if any, is from /d, and
18907                  * the class matches more than one code point, and the lowest
18908                  * code point participates in some fold.  It might be that the
18909                  * other code points are /i equivalent to this one, and hence
18910                  * they would be representable by an EXACTFish node.  Above, we
18911                  * eliminated classes that contain too many code points to be
18912                  * EXACTFish, with the test for MAX_FOLD_FROMS
18913                  *
18914                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18915                  * We do this because we have EXACTFAA at our disposal for the
18916                  * ASCII range */
18917                 if (partial_cp_count == 2 && isASCII(start[0])) {
18918
18919                     /* The only ASCII characters that participate in folds are
18920                      * alphabetics */
18921                     assert(isALPHA(start[0]));
18922                     if (   end[0] == start[0]   /* First range is a single
18923                                                    character, so 2nd exists */
18924                         && isALPHA_FOLD_EQ(start[0], start[1]))
18925                     {
18926
18927                         /* Here, is part of an ASCII fold pair */
18928
18929                         if (   ASCII_FOLD_RESTRICTED
18930                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18931                         {
18932                             /* If the second clause just above was true, it
18933                              * means we can't be under /i, or else the list
18934                              * would have included more than this fold pair.
18935                              * Therefore we have to exclude the possibility of
18936                              * whatever else it is that folds to these, by
18937                              * using EXACTFAA */
18938                             op = EXACTFAA;
18939                         }
18940                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18941
18942                             /* Here, there's no simple fold that start[0] is part
18943                              * of, but there is a multi-character one.  If we
18944                              * are not under /i, we want to exclude that
18945                              * possibility; if under /i, we want to include it
18946                              * */
18947                             op = (FOLD) ? EXACTFU : EXACTFAA;
18948                         }
18949                         else {
18950
18951                             /* Here, the only possible fold start[0] participates in
18952                              * is with start[1].  /i or not isn't relevant */
18953                             op = EXACTFU;
18954                         }
18955
18956                         value = toFOLD(start[0]);
18957                     }
18958                 }
18959                 else if (  ! upper_latin1_only_utf8_matches
18960                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18961                                                                           == 2
18962                              && PL_fold_latin1[
18963                                invlist_highest(upper_latin1_only_utf8_matches)]
18964                              == start[0]))
18965                 {
18966                     /* Here, the smallest character is non-ascii or there are
18967                      * more than 2 code points matched by this node.  Also, we
18968                      * either don't have /d UTF-8 dependent matches, or if we
18969                      * do, they look like they could be a single character that
18970                      * is the fold of the lowest one in the always-match list.
18971                      * This test quickly excludes most of the false positives
18972                      * when there are /d UTF-8 dependent matches.  These are
18973                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18974                      * SMALL LETTER A WITH GRAVE iff the target string is
18975                      * UTF-8.  (We don't have to worry above about exceeding
18976                      * the array bounds of PL_fold_latin1[] because any code
18977                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18978                      *
18979                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18980                      * points) in the ASCII range, so we can't use it here to
18981                      * artificially restrict the fold domain, so we check if
18982                      * the class does or does not match some EXACTFish node.
18983                      * Further, if we aren't under /i, and the folded-to
18984                      * character is part of a multi-character fold, we can't do
18985                      * this optimization, as the sequence around it could be
18986                      * that multi-character fold, and we don't here know the
18987                      * context, so we have to assume it is that multi-char
18988                      * fold, to prevent potential bugs.
18989                      *
18990                      * To do the general case, we first find the fold of the
18991                      * lowest code point (which may be higher than the lowest
18992                      * one), then find everything that folds to it.  (The data
18993                      * structure we have only maps from the folded code points,
18994                      * so we have to do the earlier step.) */
18995
18996                     Size_t foldlen;
18997                     U8 foldbuf[UTF8_MAXBYTES_CASE];
18998                     UV folded = _to_uni_fold_flags(start[0],
18999                                                         foldbuf, &foldlen, 0);
19000                     unsigned int first_fold;
19001                     const unsigned int * remaining_folds;
19002                     Size_t folds_to_this_cp_count = _inverse_folds(
19003                                                             folded,
19004                                                             &first_fold,
19005                                                             &remaining_folds);
19006                     Size_t folds_count = folds_to_this_cp_count + 1;
19007                     SV * fold_list = _new_invlist(folds_count);
19008                     unsigned int i;
19009
19010                     /* If there are UTF-8 dependent matches, create a temporary
19011                      * list of what this node matches, including them. */
19012                     SV * all_cp_list = NULL;
19013                     SV ** use_this_list = &cp_list;
19014
19015                     if (upper_latin1_only_utf8_matches) {
19016                         all_cp_list = _new_invlist(0);
19017                         use_this_list = &all_cp_list;
19018                         _invlist_union(cp_list,
19019                                        upper_latin1_only_utf8_matches,
19020                                        use_this_list);
19021                     }
19022
19023                     /* Having gotten everything that participates in the fold
19024                      * containing the lowest code point, we turn that into an
19025                      * inversion list, making sure everything is included. */
19026                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19027                     fold_list = add_cp_to_invlist(fold_list, folded);
19028                     if (folds_to_this_cp_count > 0) {
19029                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19030                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19031                             fold_list = add_cp_to_invlist(fold_list,
19032                                                         remaining_folds[i]);
19033                         }
19034                     }
19035
19036                     /* If the fold list is identical to what's in this ANYOF
19037                      * node, the node can be represented by an EXACTFish one
19038                      * instead */
19039                     if (_invlistEQ(*use_this_list, fold_list,
19040                                    0 /* Don't complement */ )
19041                     ) {
19042
19043                         /* But, we have to be careful, as mentioned above.
19044                          * Just the right sequence of characters could match
19045                          * this if it is part of a multi-character fold.  That
19046                          * IS what we want if we are under /i.  But it ISN'T
19047                          * what we want if not under /i, as it could match when
19048                          * it shouldn't.  So, when we aren't under /i and this
19049                          * character participates in a multi-char fold, we
19050                          * don't optimize into an EXACTFish node.  So, for each
19051                          * case below we have to check if we are folding
19052                          * and if not, if it is not part of a multi-char fold.
19053                          * */
19054                         if (start[0] > 255) {    /* Highish code point */
19055                             if (FOLD || ! _invlist_contains_cp(
19056                                             PL_InMultiCharFold, folded))
19057                             {
19058                                 op = (LOC)
19059                                      ? EXACTFLU8
19060                                      : (ASCII_FOLD_RESTRICTED)
19061                                        ? EXACTFAA
19062                                        : EXACTFU_REQ8;
19063                                 value = folded;
19064                             }
19065                         }   /* Below, the lowest code point < 256 */
19066                         else if (    FOLD
19067                                  &&  folded == 's'
19068                                  &&  DEPENDS_SEMANTICS)
19069                         {   /* An EXACTF node containing a single character
19070                                 's', can be an EXACTFU if it doesn't get
19071                                 joined with an adjacent 's' */
19072                             op = EXACTFU_S_EDGE;
19073                             value = folded;
19074                         }
19075                         else if (    FOLD
19076                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19077                         {
19078                             if (upper_latin1_only_utf8_matches) {
19079                                 op = EXACTF;
19080
19081                                 /* We can't use the fold, as that only matches
19082                                  * under UTF-8 */
19083                                 value = start[0];
19084                             }
19085                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19086                                      && ! UTF)
19087                             {   /* EXACTFUP is a special node for this
19088                                    character */
19089                                 op = (ASCII_FOLD_RESTRICTED)
19090                                      ? EXACTFAA
19091                                      : EXACTFUP;
19092                                 value = MICRO_SIGN;
19093                             }
19094                             else if (     ASCII_FOLD_RESTRICTED
19095                                      && ! isASCII(start[0]))
19096                             {   /* For ASCII under /iaa, we can use EXACTFU
19097                                    below */
19098                                 op = EXACTFAA;
19099                                 value = folded;
19100                             }
19101                             else {
19102                                 op = EXACTFU;
19103                                 value = folded;
19104                             }
19105                         }
19106                     }
19107
19108                     SvREFCNT_dec_NN(fold_list);
19109                     SvREFCNT_dec(all_cp_list);
19110                 }
19111             }
19112
19113             if (op != END) {
19114                 U8 len;
19115
19116                 /* Here, we have calculated what EXACTish node to use.  Have to
19117                  * convert to UTF-8 if not already there */
19118                 if (value > 255) {
19119                     if (! UTF) {
19120                         SvREFCNT_dec(cp_list);;
19121                         REQUIRE_UTF8(flagp);
19122                     }
19123
19124                     /* This is a kludge to the special casing issues with this
19125                      * ligature under /aa.  FB05 should fold to FB06, but the
19126                      * call above to _to_uni_fold_flags() didn't find this, as
19127                      * it didn't use the /aa restriction in order to not miss
19128                      * other folds that would be affected.  This is the only
19129                      * instance likely to ever be a problem in all of Unicode.
19130                      * So special case it. */
19131                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19132                         && ASCII_FOLD_RESTRICTED)
19133                     {
19134                         value = LATIN_SMALL_LIGATURE_ST;
19135                     }
19136                 }
19137
19138                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19139
19140                 ret = regnode_guts(pRExC_state, op, len, "exact");
19141                 FILL_NODE(ret, op);
19142                 RExC_emit += 1 + STR_SZ(len);
19143                 setSTR_LEN(REGNODE_p(ret), len);
19144                 if (len == 1) {
19145                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19146                 }
19147                 else {
19148                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19149                 }
19150                 goto not_anyof;
19151             }
19152         }
19153
19154         if (! has_runtime_dependency) {
19155
19156             /* See if this can be turned into an ANYOFM node.  Think about the
19157              * bit patterns in two different bytes.  In some positions, the
19158              * bits in each will be 1; and in other positions both will be 0;
19159              * and in some positions the bit will be 1 in one byte, and 0 in
19160              * the other.  Let 'n' be the number of positions where the bits
19161              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19162              * a position where the two bytes differ.  Now take the set of all
19163              * bytes that when ANDed with the mask yield the same result.  That
19164              * set has 2**n elements, and is representable by just two 8 bit
19165              * numbers: the result and the mask.  Importantly, matching the set
19166              * can be vectorized by creating a word full of the result bytes,
19167              * and a word full of the mask bytes, yielding a significant speed
19168              * up.  Here, see if this node matches such a set.  As a concrete
19169              * example consider [01], and the byte representing '0' which is
19170              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19171              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19172              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19173              * which is a common usage, is optimizable into ANYOFM, and can
19174              * benefit from the speed up.  We can only do this on UTF-8
19175              * invariant bytes, because they have the same bit patterns under
19176              * UTF-8 as not. */
19177             PERL_UINT_FAST8_T inverted = 0;
19178 #ifdef EBCDIC
19179             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19180 #else
19181             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19182 #endif
19183             /* If it doesn't fit the criteria for ANYOFM, invert and try again.
19184              * If that works we will instead later generate an NANYOFM, and
19185              * invert back when through */
19186             if (invlist_highest(cp_list) > max_permissible) {
19187                 _invlist_invert(cp_list);
19188                 inverted = 1;
19189             }
19190
19191             if (invlist_highest(cp_list) <= max_permissible) {
19192                 UV this_start, this_end;
19193                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19194                 U8 bits_differing = 0;
19195                 Size_t full_cp_count = 0;
19196                 bool first_time = TRUE;
19197
19198                 /* Go through the bytes and find the bit positions that differ
19199                  * */
19200                 invlist_iterinit(cp_list);
19201                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19202                     unsigned int i = this_start;
19203
19204                     if (first_time) {
19205                         if (! UVCHR_IS_INVARIANT(i)) {
19206                             goto done_anyofm;
19207                         }
19208
19209                         first_time = FALSE;
19210                         lowest_cp = this_start;
19211
19212                         /* We have set up the code point to compare with.
19213                          * Don't compare it with itself */
19214                         i++;
19215                     }
19216
19217                     /* Find the bit positions that differ from the lowest code
19218                      * point in the node.  Keep track of all such positions by
19219                      * OR'ing */
19220                     for (; i <= this_end; i++) {
19221                         if (! UVCHR_IS_INVARIANT(i)) {
19222                             goto done_anyofm;
19223                         }
19224
19225                         bits_differing  |= i ^ lowest_cp;
19226                     }
19227
19228                     full_cp_count += this_end - this_start + 1;
19229                 }
19230
19231                 /* At the end of the loop, we count how many bits differ from
19232                  * the bits in lowest code point, call the count 'd'.  If the
19233                  * set we found contains 2**d elements, it is the closure of
19234                  * all code points that differ only in those bit positions.  To
19235                  * convince yourself of that, first note that the number in the
19236                  * closure must be a power of 2, which we test for.  The only
19237                  * way we could have that count and it be some differing set,
19238                  * is if we got some code points that don't differ from the
19239                  * lowest code point in any position, but do differ from each
19240                  * other in some other position.  That means one code point has
19241                  * a 1 in that position, and another has a 0.  But that would
19242                  * mean that one of them differs from the lowest code point in
19243                  * that position, which possibility we've already excluded.  */
19244                 if (  (inverted || full_cp_count > 1)
19245                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19246                 {
19247                     U8 ANYOFM_mask;
19248
19249                     op = ANYOFM + inverted;;
19250
19251                     /* We need to make the bits that differ be 0's */
19252                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19253
19254                     /* The argument is the lowest code point */
19255                     ret = reganode(pRExC_state, op, lowest_cp);
19256                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19257                 }
19258
19259               done_anyofm:
19260                 invlist_iterfinish(cp_list);
19261             }
19262
19263             if (inverted) {
19264                 _invlist_invert(cp_list);
19265             }
19266
19267             if (op != END) {
19268                 goto not_anyof;
19269             }
19270
19271             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19272              * all were invariants, it wasn't inverted, and there is a single
19273              * range.  This would be faster than some of the posix nodes we
19274              * create below like /\d/a, but would be twice the size.  Without
19275              * having actually measured the gain, khw doesn't think the
19276              * tradeoff is really worth it */
19277         }
19278
19279         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19280             PERL_UINT_FAST8_T type;
19281             SV * intersection = NULL;
19282             SV* d_invlist = NULL;
19283
19284             /* See if this matches any of the POSIX classes.  The POSIXA and
19285              * POSIXD ones are about the same speed as ANYOF ops, but take less
19286              * room; the ones that have above-Latin1 code point matches are
19287              * somewhat faster than ANYOF.  */
19288
19289             for (type = POSIXA; type >= POSIXD; type--) {
19290                 int posix_class;
19291
19292                 if (type == POSIXL) {   /* But not /l posix classes */
19293                     continue;
19294                 }
19295
19296                 for (posix_class = 0;
19297                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19298                      posix_class++)
19299                 {
19300                     SV** our_code_points = &cp_list;
19301                     SV** official_code_points;
19302                     int try_inverted;
19303
19304                     if (type == POSIXA) {
19305                         official_code_points = &PL_Posix_ptrs[posix_class];
19306                     }
19307                     else {
19308                         official_code_points = &PL_XPosix_ptrs[posix_class];
19309                     }
19310
19311                     /* Skip non-existent classes of this type.  e.g. \v only
19312                      * has an entry in PL_XPosix_ptrs */
19313                     if (! *official_code_points) {
19314                         continue;
19315                     }
19316
19317                     /* Try both the regular class, and its inversion */
19318                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19319                         bool this_inverted = invert ^ try_inverted;
19320
19321                         if (type != POSIXD) {
19322
19323                             /* This class that isn't /d can't match if we have
19324                              * /d dependencies */
19325                             if (has_runtime_dependency
19326                                                     & HAS_D_RUNTIME_DEPENDENCY)
19327                             {
19328                                 continue;
19329                             }
19330                         }
19331                         else /* is /d */ if (! this_inverted) {
19332
19333                             /* /d classes don't match anything non-ASCII below
19334                              * 256 unconditionally (which cp_list contains) */
19335                             _invlist_intersection(cp_list, PL_UpperLatin1,
19336                                                            &intersection);
19337                             if (_invlist_len(intersection) != 0) {
19338                                 continue;
19339                             }
19340
19341                             SvREFCNT_dec(d_invlist);
19342                             d_invlist = invlist_clone(cp_list, NULL);
19343
19344                             /* But under UTF-8 it turns into using /u rules.
19345                              * Add the things it matches under these conditions
19346                              * so that we check below that these are identical
19347                              * to what the tested class should match */
19348                             if (upper_latin1_only_utf8_matches) {
19349                                 _invlist_union(
19350                                             d_invlist,
19351                                             upper_latin1_only_utf8_matches,
19352                                             &d_invlist);
19353                             }
19354                             our_code_points = &d_invlist;
19355                         }
19356                         else {  /* POSIXD, inverted.  If this doesn't have this
19357                                    flag set, it isn't /d. */
19358                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19359                             {
19360                                 continue;
19361                             }
19362                             our_code_points = &cp_list;
19363                         }
19364
19365                         /* Here, have weeded out some things.  We want to see
19366                          * if the list of characters this node contains
19367                          * ('*our_code_points') precisely matches those of the
19368                          * class we are currently checking against
19369                          * ('*official_code_points'). */
19370                         if (_invlistEQ(*our_code_points,
19371                                        *official_code_points,
19372                                        try_inverted))
19373                         {
19374                             /* Here, they precisely match.  Optimize this ANYOF
19375                              * node into its equivalent POSIX one of the
19376                              * correct type, possibly inverted */
19377                             ret = reg_node(pRExC_state, (try_inverted)
19378                                                         ? type + NPOSIXA
19379                                                                 - POSIXA
19380                                                         : type);
19381                             FLAGS(REGNODE_p(ret)) = posix_class;
19382                             SvREFCNT_dec(d_invlist);
19383                             SvREFCNT_dec(intersection);
19384                             goto not_anyof;
19385                         }
19386                     }
19387                 }
19388             }
19389             SvREFCNT_dec(d_invlist);
19390             SvREFCNT_dec(intersection);
19391         }
19392
19393         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19394          * both in size and speed.  Currently, a 20 bit range base (smallest
19395          * code point in the range), and a 12 bit maximum delta are packed into
19396          * a 32 bit word.  This allows for using it on all of the Unicode code
19397          * points except for the highest plane, which is only for private use
19398          * code points.  khw doubts that a bigger delta is likely in real world
19399          * applications */
19400         if (     single_range
19401             && ! has_runtime_dependency
19402             &&   anyof_flags == 0
19403             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19404             &&   end[0] - start[0]
19405                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19406                                    * CHARBITS - ANYOFR_BASE_BITS))))
19407
19408         {
19409             U8 low_utf8[UTF8_MAXBYTES+1];
19410             U8 high_utf8[UTF8_MAXBYTES+1];
19411
19412             ret = reganode(pRExC_state, ANYOFR,
19413                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19414
19415             /* Place the lowest UTF-8 start byte in the flags field, so as to
19416              * allow efficient ruling out at run time of many possible inputs.
19417              * */
19418             (void) uvchr_to_utf8(low_utf8, start[0]);
19419             (void) uvchr_to_utf8(high_utf8, end[0]);
19420
19421             /* If all code points share the same first byte, this can be an
19422              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19423              * quickly rule out many inputs at run-time without having to
19424              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19425              * not doing that transformation would not rule out nearly so many
19426              * things */
19427             if (low_utf8[0] == high_utf8[0]) {
19428                 OP(REGNODE_p(ret)) = ANYOFRb;
19429                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19430             }
19431             else {
19432                 ANYOF_FLAGS(REGNODE_p(ret))
19433                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19434             }
19435
19436             goto not_anyof;
19437         }
19438
19439         /* If didn't find an optimization and there is no need for a bitmap,
19440          * optimize to indicate that */
19441         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19442             && ! LOC
19443             && ! upper_latin1_only_utf8_matches
19444             &&   anyof_flags == 0)
19445         {
19446             U8 low_utf8[UTF8_MAXBYTES+1];
19447             UV highest_cp = invlist_highest(cp_list);
19448
19449             /* Currently the maximum allowed code point by the system is
19450              * IV_MAX.  Higher ones are reserved for future internal use.  This
19451              * particular regnode can be used for higher ones, but we can't
19452              * calculate the code point of those.  IV_MAX suffices though, as
19453              * it will be a large first byte */
19454             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19455                            - low_utf8;
19456
19457             /* We store the lowest possible first byte of the UTF-8
19458              * representation, using the flags field.  This allows for quick
19459              * ruling out of some inputs without having to convert from UTF-8
19460              * to code point.  For EBCDIC, we use I8, as not doing that
19461              * transformation would not rule out nearly so many things */
19462             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19463
19464             op = ANYOFH;
19465
19466             /* If the first UTF-8 start byte for the highest code point in the
19467              * range is suitably small, we may be able to get an upper bound as
19468              * well */
19469             if (highest_cp <= IV_MAX) {
19470                 U8 high_utf8[UTF8_MAXBYTES+1];
19471                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19472                                 - high_utf8;
19473
19474                 /* If the lowest and highest are the same, we can get an exact
19475                  * first byte instead of a just minimum or even a sequence of
19476                  * exact leading bytes.  We signal these with different
19477                  * regnodes */
19478                 if (low_utf8[0] == high_utf8[0]) {
19479                     Size_t len = find_first_differing_byte_pos(low_utf8,
19480                                                                high_utf8,
19481                                                        MIN(low_len, high_len));
19482
19483                     if (len == 1) {
19484
19485                         /* No need to convert to I8 for EBCDIC as this is an
19486                          * exact match */
19487                         anyof_flags = low_utf8[0];
19488                         op = ANYOFHb;
19489                     }
19490                     else {
19491                         op = ANYOFHs;
19492                         ret = regnode_guts(pRExC_state, op,
19493                                            regarglen[op] + STR_SZ(len),
19494                                            "anyofhs");
19495                         FILL_NODE(ret, op);
19496                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19497                                                                         = len;
19498                         Copy(low_utf8,  /* Add the common bytes */
19499                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19500                            len, U8);
19501                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19502                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19503                                                   NULL, only_utf8_locale_list);
19504                         goto not_anyof;
19505                     }
19506                 }
19507                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19508                 {
19509
19510                     /* Here, the high byte is not the same as the low, but is
19511                      * small enough that it's reasonable to have a loose upper
19512                      * bound, which is packed in with the strict lower bound.
19513                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19514                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19515                      * is the same thing as UTF-8 */
19516
19517                     U8 bits = 0;
19518                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19519                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19520                                   - anyof_flags;
19521
19522                     if (range_diff <= max_range_diff / 8) {
19523                         bits = 3;
19524                     }
19525                     else if (range_diff <= max_range_diff / 4) {
19526                         bits = 2;
19527                     }
19528                     else if (range_diff <= max_range_diff / 2) {
19529                         bits = 1;
19530                     }
19531                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19532                     op = ANYOFHr;
19533                 }
19534             }
19535
19536             goto done_finding_op;
19537         }
19538     }   /* End of seeing if can optimize it into a different node */
19539
19540   is_anyof: /* It's going to be an ANYOF node. */
19541     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19542          ? ANYOFD
19543          : ((posixl)
19544             ? ANYOFPOSIXL
19545             : ((LOC)
19546                ? ANYOFL
19547                : ANYOF));
19548
19549   done_finding_op:
19550
19551     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19552     FILL_NODE(ret, op);        /* We set the argument later */
19553     RExC_emit += 1 + regarglen[op];
19554     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19555
19556     /* Here, <cp_list> contains all the code points we can determine at
19557      * compile time that match under all conditions.  Go through it, and
19558      * for things that belong in the bitmap, put them there, and delete from
19559      * <cp_list>.  While we are at it, see if everything above 255 is in the
19560      * list, and if so, set a flag to speed up execution */
19561
19562     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19563
19564     if (posixl) {
19565         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19566     }
19567
19568     if (invert) {
19569         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19570     }
19571
19572     /* Here, the bitmap has been populated with all the Latin1 code points that
19573      * always match.  Can now add to the overall list those that match only
19574      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19575      * */
19576     if (upper_latin1_only_utf8_matches) {
19577         if (cp_list) {
19578             _invlist_union(cp_list,
19579                            upper_latin1_only_utf8_matches,
19580                            &cp_list);
19581             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19582         }
19583         else {
19584             cp_list = upper_latin1_only_utf8_matches;
19585         }
19586         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19587     }
19588
19589     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19590                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19591                    ? listsv
19592                    : NULL,
19593                   only_utf8_locale_list);
19594     SvREFCNT_dec(cp_list);;
19595     SvREFCNT_dec(only_utf8_locale_list);
19596     return ret;
19597
19598   not_anyof:
19599
19600     /* Here, the node is getting optimized into something that's not an ANYOF
19601      * one.  Finish up. */
19602
19603     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19604                                            RExC_parse - orig_parse);;
19605     SvREFCNT_dec(cp_list);;
19606     SvREFCNT_dec(only_utf8_locale_list);
19607     return ret;
19608 }
19609
19610 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* Not for use past the function above */
19611
19612 STATIC void
19613 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19614                 regnode* const node,
19615                 SV* const cp_list,
19616                 SV* const runtime_defns,
19617                 SV* const only_utf8_locale_list)
19618 {
19619     /* Sets the arg of the ANYOF-type regnode 'node'.  If all three lists are
19620      * NULL, nothing lies outside the node's bitmap, and the arg is set to
19621      * ANYOF_ONLY_HAS_BITMAP.  Otherwise, the arg is the index that add_data()
19622      * returns, and the data slot at that index is given a reference to a new
19623      * array.  Each non-NULL list has its reference count raised, and is
19624      * stored in that array (nothing is stored for a NULL list), at:
19625      *
19626      *  INVLIST_INDEX               'cp_list': the inversion list defining
19627      *                              this class, as far as is known now
19628      *  ONLY_LOCALE_MATCHES_INDEX   'only_utf8_locale_list': code points that
19629      *                              match only if the locale is a UTF-8 one
19630      *  DEFERRED_USER_DEFINED_INDEX 'runtime_defns': user-defined properties
19631      *                              whose definitions aren't yet known */
19632
19633     AV * lists;
19634     SV * lists_ref;
19635     UV data_index;
19636
19637     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19638
19639     if (! (cp_list || runtime_defns || only_utf8_locale_list)) {
19640         assert(0 == (ANYOF_FLAGS(node)
19641                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19642         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19643         return;
19644     }
19645
19646     lists = newAV();
19647     if (cp_list != NULL) {
19648         av_store(lists, INVLIST_INDEX, SvREFCNT_inc(cp_list));
19649     }
19650     if (only_utf8_locale_list != NULL) {
19651         av_store(lists, ONLY_LOCALE_MATCHES_INDEX,
19652                  SvREFCNT_inc(only_utf8_locale_list));
19653     }
19654     if (runtime_defns != NULL) {
19655         av_store(lists, DEFERRED_USER_DEFINED_INDEX,
19656                  SvREFCNT_inc(runtime_defns));
19657     }
19658
19659     /* Wrap the array in a reference, and park that in a new data slot */
19660     lists_ref = newRV_noinc(MUTABLE_SV(lists));
19661     data_index = add_data(pRExC_state, STR_WITH_LEN("s"));
19662     RExC_rxi->data->data[data_index] = (void*) lists_ref;
19663     ARG_SET(node, data_index);
19664 }
19665
19666 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19667 SV *
19668 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19669                                         const regnode* node,
19670                                         bool doinit,
19671                                         SV** listsvp,
19672                                         SV** only_utf8_locale_ptr,
19673                                         SV** output_invlist)
19674
19675 {
19676     /* For internal core use only.
19677      * Returns the inversion list for the input 'node' in the regex 'prog'.
19678      * If <doinit> is 'true', will attempt to create the inversion list if not
19679      *    already done.
19680      * If <listsvp> is non-null, will return the printable contents of the
19681      *    property definition.  This can be used to get debugging information
19682      *    even before the inversion list exists, by calling this function with
19683      *    'doinit' set to false, in which case the components that will be used
19684      *    to eventually create the inversion list are returned  (in a printable
19685      *    form).
19686      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19687      *    store an inversion list of code points that should match only if the
19688      *    execution-time locale is a UTF-8 one.
19689      * If <output_invlist> is not NULL, it is where this routine is to store an
19690      *    inversion list of the code points that would be instead returned in
19691      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19692      *    when this parameter is used, is just the non-code point data that
19693      *    will go into creating the inversion list.  This currently should be just
19694      *    user-defined properties whose definitions were not known at compile
19695      *    time.  Using this parameter allows for easier manipulation of the
19696      *    inversion list's data by the caller.  It is illegal to call this
19697      *    function with this parameter set, but not <listsvp>
19698      *
19699      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19700      * that, in spite of this function's name, the inversion list it returns
19701      * may include the bitmap data as well */
19702
19703     SV *si  = NULL;         /* Input initialization string */
19704     SV* invlist = NULL;
19705
19706     RXi_GET_DECL(prog, progi);
19707     const struct reg_data * const data = prog ? progi->data : NULL;
19708
19709     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19710     assert(! output_invlist || listsvp);
19711
19712     if (data && data->count) {
19713         const U32 n = ARG(node);
19714
19715         if (data->what[n] == 's') {
19716             SV * const rv = MUTABLE_SV(data->data[n]);
19717             AV * const av = MUTABLE_AV(SvRV(rv));
19718             SV **const ary = AvARRAY(av);
19719
19720             invlist = ary[INVLIST_INDEX];
19721
19722             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19723                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19724             }
19725
19726             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19727                 si = ary[DEFERRED_USER_DEFINED_INDEX];
19728             }
19729
19730             if (doinit && (si || invlist)) {
19731                 if (si) {
19732                     bool user_defined;
19733                     SV * msg = newSVpvs_flags("", SVs_TEMP);
19734
19735                     SV * prop_definition = handle_user_defined_property(
19736                             "", 0, FALSE,   /* There is no \p{}, \P{} */
19737                             SvPVX_const(si)[1] - '0',   /* /i or not has been
19738                                                            stored here for just
19739                                                            this occasion */
19740                             TRUE,           /* run time */
19741                             FALSE,          /* This call must find the defn */
19742                             si,             /* The property definition  */
19743                             &user_defined,
19744                             msg,
19745                             0               /* base level call */
19746                            );
19747
19748                     if (SvCUR(msg)) {
19749                         assert(prop_definition == NULL);
19750
19751                         Perl_croak(aTHX_ "%" UTF8f,
19752                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19753                     }
19754
19755                     if (invlist) {
19756                         _invlist_union(invlist, prop_definition, &invlist);
19757                         SvREFCNT_dec_NN(prop_definition);
19758                     }
19759                     else {
19760                         invlist = prop_definition;
19761                     }
19762
19763                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19764                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19765
19766                     ary[INVLIST_INDEX] = invlist;
19767                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19768                                  ? ONLY_LOCALE_MATCHES_INDEX
19769                                  : INVLIST_INDEX);
19770                     si = NULL;
19771                 }
19772             }
19773         }
19774     }
19775
19776     /* If requested, return a printable version of what this ANYOF node matches
19777      * */
19778     if (listsvp) {
19779         SV* matches_string = NULL;
19780
19781         /* This function can be called at compile-time, before everything gets
19782          * resolved, in which case we return the currently best available
19783          * information, which is the string that will eventually be used to do
19784          * that resolving, 'si' */
19785         if (si) {
19786             /* Here, we only have 'si' (and possibly some passed-in data in
19787              * 'invlist', which is handled below)  If the caller only wants
19788              * 'si', use that.  */
19789             if (! output_invlist) {
19790                 matches_string = newSVsv(si);
19791             }
19792             else {
19793                 /* But if the caller wants an inversion list of the node, we
19794                  * need to parse 'si' and place as much as possible in the
19795                  * desired output inversion list, making 'matches_string' only
19796                  * contain the currently unresolvable things */
19797                 const char *si_string = SvPVX(si);
19798                 STRLEN remaining = SvCUR(si);
19799                 UV prev_cp = 0;
19800                 U8 count = 0;
19801
19802                 /* Ignore everything before the first new-line */
19803                 while (*si_string != '\n' && remaining > 0) {
19804                     si_string++;
19805                     remaining--;
19806                 }
19807                 assert(remaining > 0);
19808
19809                 si_string++;
19810                 remaining--;
19811
19812                 while (remaining > 0) {
19813
19814                     /* The data consists of just strings defining user-defined
19815                      * property names, but in prior incarnations, and perhaps
19816                      * somehow from pluggable regex engines, it could still
19817                      * hold hex code point definitions.  Each component of a
19818                      * range would be separated by a tab, and each range by a
19819                      * new-line.  If these are found, instead add them to the
19820                      * inversion list */
19821                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19822                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19823                     STRLEN len = remaining;
19824                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19825
19826                     /* If the hex decode routine found something, it should go
19827                      * up to the next \n */
19828                     if (   *(si_string + len) == '\n') {
19829                         if (count) {    /* 2nd code point on line */
19830                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19831                         }
19832                         else {
19833                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19834                         }
19835                         count = 0;
19836                         goto prepare_for_next_iteration;
19837                     }
19838
19839                     /* If the hex decode was instead for the lower range limit,
19840                      * save it, and go parse the upper range limit */
19841                     if (*(si_string + len) == '\t') {
19842                         assert(count == 0);
19843
19844                         prev_cp = cp;
19845                         count = 1;
19846                       prepare_for_next_iteration:
19847                         si_string += len + 1;
19848                         remaining -= len + 1;
19849                         continue;
19850                     }
19851
19852                     /* Here, didn't find a legal hex number.  Just add it from
19853                      * here to the next \n */
19854
19855                     remaining -= len;
19856                     while (*(si_string + len) != '\n' && remaining > 0) {
19857                         remaining--;
19858                         len++;
19859                     }
19860                     if (*(si_string + len) == '\n') {
19861                         len++;
19862                         remaining--;
19863                     }
19864                     if (matches_string) {
19865                         sv_catpvn(matches_string, si_string, len - 1);
19866                     }
19867                     else {
19868                         matches_string = newSVpvn(si_string, len - 1);
19869                     }
19870                     si_string += len;
19871                     sv_catpvs(matches_string, " ");
19872                 } /* end of loop through the text */
19873
19874                 assert(matches_string);
19875                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19876                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19877                 }
19878             } /* end of has an 'si' */
19879         }
19880
19881         /* Add the stuff that's already known */
19882         if (invlist) {
19883
19884             /* Again, if the caller doesn't want the output inversion list, put
19885              * everything in 'matches-string' */
19886             if (! output_invlist) {
19887                 if ( ! matches_string) {
19888                     matches_string = newSVpvs("\n");
19889                 }
19890                 sv_catsv(matches_string, invlist_contents(invlist,
19891                                                   TRUE /* traditional style */
19892                                                   ));
19893             }
19894             else if (! *output_invlist) {
19895                 *output_invlist = invlist_clone(invlist, NULL);
19896             }
19897             else {
19898                 _invlist_union(*output_invlist, invlist, output_invlist);
19899             }
19900         }
19901
19902         *listsvp = matches_string;
19903     }
19904
19905     return invlist;
19906 }
19907 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19908
19909 /* reg_skipcomment()
19910
19911    Absorbs an /x style # comment from the input stream,
19912    returning a pointer to the first character beyond the comment, or if the
19913    comment terminates the pattern without anything following it, this returns
19914    one past the final character of the pattern (in other words, RExC_end) and
19915    sets the REG_RUN_ON_COMMENT_SEEN flag.
19916
19917    Note it's the callers responsibility to ensure that we are
19918    actually in /x mode
19919
19920 */
19921
19922 PERL_STATIC_INLINE char*
19923 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19924 {
19925     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19926
19927     assert(*p == '#');
19928
19929     while (p < RExC_end) {
19930         if (*(++p) == '\n') {
19931             return p+1;
19932         }
19933     }
19934
19935     /* we ran off the end of the pattern without ending the comment, so we have
19936      * to add an \n when wrapping */
19937     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19938     return p;
19939 }
19940
19941 STATIC void
19942 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19943                                 char ** p,
19944                                 const bool force_to_xmod
19945                          )
19946 {
19947     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19948      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19949      * is /x whitespace, advance '*p' so that on exit it points to the first
19950      * byte past all such white space and comments */
19951
19952     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19953
19954     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19955
19956     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19957
19958     for (;;) {
19959         if (RExC_end - (*p) >= 3
19960             && *(*p)     == '('
19961             && *(*p + 1) == '?'
19962             && *(*p + 2) == '#')
19963         {
19964             while (*(*p) != ')') {
19965                 if ((*p) == RExC_end)
19966                     FAIL("Sequence (?#... not terminated");
19967                 (*p)++;
19968             }
19969             (*p)++;
19970             continue;
19971         }
19972
19973         if (use_xmod) {
19974             const char * save_p = *p;
19975             while ((*p) < RExC_end) {
19976                 STRLEN len;
19977                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19978                     (*p) += len;
19979                 }
19980                 else if (*(*p) == '#') {
19981                     (*p) = reg_skipcomment(pRExC_state, (*p));
19982                 }
19983                 else {
19984                     break;
19985                 }
19986             }
19987             if (*p != save_p) {
19988                 continue;
19989             }
19990         }
19991
19992         break;
19993     }
19994
19995     return;
19996 }
19997
19998 /* nextchar()
19999
20000    Advances the parse position by one byte, unless that byte is the beginning
20001    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20002    those two cases, the parse position is advanced beyond all such comments and
20003    white space.
20004
20005    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20006 */
20007
20008 STATIC void
20009 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20010 {
20011     PERL_ARGS_ASSERT_NEXTCHAR;
20012
20013     if (RExC_parse < RExC_end) {
20014         assert(   ! UTF
20015                || UTF8_IS_INVARIANT(*RExC_parse)
20016                || UTF8_IS_START(*RExC_parse));
20017
20018         RExC_parse += (UTF)
20019                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20020                       : 1;
20021
20022         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20023                                 FALSE /* Don't force /x */ );
20024     }
20025 }
20026
20027 STATIC void
20028 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20029 {
20030     /* 'size' is the delta number of smallest regnode equivalents to add or
20031      * subtract from the current memory allocated to the regex engine being
20032      * constructed. */
20033
20034     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20035
20036     RExC_size += size;
20037
20038     Renewc(RExC_rxi,
20039            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20040                                                 /* +1 for REG_MAGIC */
20041            char,
20042            regexp_internal);
20043     if ( RExC_rxi == NULL )
20044         FAIL("Regexp out of space");
20045     RXi_SET(RExC_rx, RExC_rxi);
20046
20047     RExC_emit_start = RExC_rxi->program;
20048     if (size > 0) {
20049         Zero(REGNODE_p(RExC_emit), size, regnode);
20050     }
20051
20052 #ifdef RE_TRACK_PATTERN_OFFSETS
20053     Renew(RExC_offsets, 2*RExC_size+1, U32);
20054     if (size > 0) {
20055         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20056     }
20057     RExC_offsets[0] = RExC_size;
20058 #endif
20059 }
20060
20061 STATIC regnode_offset
20062 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20063 {
20064     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20065      * equivalents space.  It aligns and increments RExC_size
20066      *
20067      * It returns the regnode's offset into the regex engine program */
20068
20069     const regnode_offset ret = RExC_emit;
20070
20071     GET_RE_DEBUG_FLAGS_DECL;
20072
20073     PERL_ARGS_ASSERT_REGNODE_GUTS;
20074
20075     SIZE_ALIGN(RExC_size);
20076     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20077     NODE_ALIGN_FILL(REGNODE_p(ret));
20078 #ifndef RE_TRACK_PATTERN_OFFSETS
20079     PERL_UNUSED_ARG(name);
20080     PERL_UNUSED_ARG(op);
20081 #else
20082     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20083
20084     if (RExC_offsets) {         /* MJD */
20085         MJD_OFFSET_DEBUG(
20086               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20087               name, __LINE__,
20088               PL_reg_name[op],
20089               (UV)(RExC_emit) > RExC_offsets[0]
20090                 ? "Overwriting end of array!\n" : "OK",
20091               (UV)(RExC_emit),
20092               (UV)(RExC_parse - RExC_start),
20093               (UV)RExC_offsets[0]));
20094         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20095     }
20096 #endif
20097     return(ret);
20098 }
20099
20100 /*
20101 - reg_node - emit a node
20102 */
20103 STATIC regnode_offset /* Location. */
20104 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20105 {
20106     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20107     regnode_offset ptr = ret;
20108
20109     PERL_ARGS_ASSERT_REG_NODE;
20110
20111     assert(regarglen[op] == 0);
20112
20113     FILL_ADVANCE_NODE(ptr, op);
20114     RExC_emit = ptr;
20115     return(ret);
20116 }
20117
20118 /*
20119 - reganode - emit a node with an argument
20120 */
20121 STATIC regnode_offset /* Location. */
20122 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20123 {
20124     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20125     regnode_offset ptr = ret;
20126
20127     PERL_ARGS_ASSERT_REGANODE;
20128
20129     /* ANYOF are special cased to allow non-length 1 args */
20130     assert(regarglen[op] == 1);
20131
20132     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20133     RExC_emit = ptr;
20134     return(ret);
20135 }
20136
20137 STATIC regnode_offset
20138 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20139 {
20140     /* emit a node with U32 and I32 arguments */
20141
20142     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20143     regnode_offset ptr = ret;
20144
20145     PERL_ARGS_ASSERT_REG2LANODE;
20146
20147     assert(regarglen[op] == 2);
20148
20149     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20150     RExC_emit = ptr;
20151     return(ret);
20152 }
20153
/*
- reginsert - insert an operator in front of already-emitted operand
*
* That means that on exit 'operand' is the offset of the newly inserted
* operator, and the original operand has been relocated.
*
* 'op' is the type of the node to insert; 'operand' is the offset in the
* program at which to insert it.  Everything from 'operand' up to the current
* emit position is moved up by the size of the new node (one regnode plus
* regarglen[op] argument regnodes), and RExC_emit is advanced by the same
* amount.  'depth' is unused.
*
* IMPORTANT NOTE - it is the *caller's* responsibility to correctly
* set up NEXT_OFF() of the inserted node if needed. Something like this:
*
*   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
*   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
*/
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
                  const regnode_offset operand, const U32 depth)
{
    regnode *src;
    regnode *dst;
    regnode *place;
    const int offset = regarglen[(U8)op];   /* Number of argument regnodes */
    const int size = NODE_STEP_REGNODE + offset; /* Total regnodes inserted */
    GET_RE_DEBUG_FLAGS_DECL;

    PERL_ARGS_ASSERT_REGINSERT;
    PERL_UNUSED_CONTEXT;
    PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
    DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
    assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
                                    studying. If this is wrong then we need to adjust RExC_recurse
                                    below like we do with RExC_open_parens/RExC_close_parens. */

    /* Make room for the new node.  'src' is the old end of the program; 'dst'
     * is the new end, 'size' regnodes further on.  These pointers must be
     * taken after the resize */
    change_engine_size(pRExC_state, (Ptrdiff_t) size);
    src = REGNODE_p(RExC_emit);
    RExC_emit += size;
    dst = REGNODE_p(RExC_emit);

    /* If we are in a "count the parentheses" pass, the numbers are unreliable,
     * and [perl #133871] shows this can lead to problems, so skip this
     * realignment of parens until a later pass when they are reliable */
    if (! IN_PARENS_PASS && RExC_open_parens) {
        int paren;
        /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
        /* remember that RExC_npar is rex->nparens + 1,
         * iow it is 1 more than the number of parens seen in
         * the pattern so far. */
        for ( paren=0 ; paren < RExC_npar ; paren++ ) {
            /* note, RExC_open_parens[0] is the start of the
             * regex, it can't move. RExC_close_parens[0] is the end
             * of the regex, it *can* move. */
            if ( paren && RExC_open_parens[paren] >= operand ) {
                /*DEBUG_PARSE_FMT("open"," - %d", size);*/
                RExC_open_parens[paren] += size;
            } else {
                /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
            }
            if ( RExC_close_parens[paren] >= operand ) {
                /*DEBUG_PARSE_FMT("close"," - %d", size);*/
                RExC_close_parens[paren] += size;
            } else {
                /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
            }
        }
    }

    /* A recorded end op (non-zero) is shifted along as well */
    if (RExC_end_op)
        RExC_end_op += size;

    /* Relocate the operand (and everything after it) upwards by 'size'
     * regnodes, copying from the end backwards so that the overlapping source
     * isn't overwritten before it has been copied */
    while (src > REGNODE_p(operand)) {
        StructCopy(--src, --dst, regnode);
#ifdef RE_TRACK_PATTERN_OFFSETS
        if (RExC_offsets) {     /* MJD 20010112 */
            MJD_OFFSET_DEBUG(
                 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
                  "reginsert",
                  __LINE__,
                  PL_reg_name[op],
                  (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
                    ? "Overwriting end of array!\n" : "OK",
                  (UV)REGNODE_OFFSET(src),
                  (UV)REGNODE_OFFSET(dst),
                  (UV)RExC_offsets[0]));
            /* The pattern offset and length recorded for the node move with
             * it */
            Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
            Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
        }
#endif
    }

    place = REGNODE_p(operand); /* Op node, where operand used to be. */
#ifdef RE_TRACK_PATTERN_OFFSETS
    if (RExC_offsets) {         /* MJD */
        MJD_OFFSET_DEBUG(
              ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
              "reginsert",
              __LINE__,
              PL_reg_name[op],
              (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
              ? "Overwriting end of array!\n" : "OK",
              (UV)REGNODE_OFFSET(place),
              (UV)(RExC_parse - RExC_start),
              (UV)RExC_offsets[0]));
        Set_Node_Offset(place, RExC_parse);
        Set_Node_Length(place, 1);
    }
#endif

    /* Fill in the vacated slot with the new operator.  'src' is reused here
     * to address the new node's argument area */
    src = NEXTOPER(place);
    FLAGS(place) = 0;
    FILL_NODE(operand, op);

    /* Zero out any arguments in the new node */
    Zero(src, offset, regnode);
}
20266
20267 /*
20268 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20269             that value won't fit in the space available, instead returns FALSE.
20270             (Except asserts if we can't fit in the largest space the regex
20271             engine is designed for.)
20272 - SEE ALSO: regtail_study
20273 */
20274 STATIC bool
20275 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20276                 const regnode_offset p,
20277                 const regnode_offset val,
20278                 const U32 depth)
20279 {
20280     regnode_offset scan;
20281     GET_RE_DEBUG_FLAGS_DECL;
20282
20283     PERL_ARGS_ASSERT_REGTAIL;
20284 #ifndef DEBUGGING
20285     PERL_UNUSED_ARG(depth);
20286 #endif
20287
20288     /* Find last node. */
20289     scan = (regnode_offset) p;
20290     for (;;) {
20291         regnode * const temp = regnext(REGNODE_p(scan));
20292         DEBUG_PARSE_r({
20293             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20294             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20295             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
20296                 SvPV_nolen_const(RExC_mysv), scan,
20297                     (temp == NULL ? "->" : ""),
20298                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20299             );
20300         });
20301         if (temp == NULL)
20302             break;
20303         scan = REGNODE_OFFSET(temp);
20304     }
20305
20306     assert(val >= scan);
20307     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20308         assert((UV) (val - scan) <= U32_MAX);
20309         ARG_SET(REGNODE_p(scan), val - scan);
20310     }
20311     else {
20312         if (val - scan > U16_MAX) {
20313             /* Populate this with something that won't loop and will likely
20314              * lead to a crash if the caller ignores the failure return, and
20315              * execution continues */
20316             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20317             return FALSE;
20318         }
20319         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20320     }
20321
20322     return TRUE;
20323 }
20324
20325 #ifdef DEBUGGING
/*
- regtail_study - set the next-pointer at the end of a node chain of p to val.
- Look for optimizable sequences at the same time.
- currently only looks for EXACT chains.

This is experimental code. The idea is to use this routine to perform
in place optimizations on branches and groups as they are constructed,
with the long term intention of removing optimization from study_chunk so
that it is purely analytical.

Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
to control which is which.

This used to return a value that was ignored.  It was a problem that it is
#ifdef'd to be another function that didn't return a value.  khw has changed it
so both currently return a pass/fail return.

Returns TRUE on success, or FALSE if the distance from the last node of the
chain to val is too large to be stored in that node's next-offset field.

*/
/* TODO: All four parms should be const */

STATIC bool
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
                      const regnode_offset val, U32 depth)
{
    regnode_offset scan;

    /* Tracks the kind of EXACTish node seen along the chain: PSEUDO until the
     * first one is met, then that node's type while every later one matches
     * it, and 0 once a mismatch or any other kind of node is encountered.
     * It is used only in the debugging output below. */
    U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
    I32 min = 0;
#endif
    GET_RE_DEBUG_FLAGS_DECL;

    PERL_ARGS_ASSERT_REGTAIL_STUDY;


    /* Find last node. */

    scan = p;
    for (;;) {
        regnode * const temp = regnext(REGNODE_p(scan));
#ifdef EXPERIMENTAL_INPLACESCAN
        if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
            bool unfolded_multi_char;   /* Unexamined in this routine */
            if (join_exact(pRExC_state, scan, &min,
                           &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
                return TRUE; /* Was return EXACT */
        }
#endif
        if ( exact ) {
            switch (OP(REGNODE_p(scan))) {
                case LEXACT:
                case EXACT:
                case LEXACT_REQ8:
                case EXACT_REQ8:
                case EXACTL:
                case EXACTF:
                case EXACTFU_S_EDGE:
                case EXACTFAA_NO_TRIE:
                case EXACTFAA:
                case EXACTFU:
                case EXACTFU_REQ8:
                case EXACTFLU8:
                case EXACTFUP:
                case EXACTFL:
                        if( exact == PSEUDO )
                            exact= OP(REGNODE_p(scan));
                        else if ( exact != OP(REGNODE_p(scan)) )
                            exact= 0;
                        /* FALLTHROUGH (intentional; only the 'break' below
                         * remains to be executed) */
                case NOTHING:
                    /* NOTHING nodes leave 'exact' unchanged */
                    break;
                default:
                    exact= 0;
            }
        }
        DEBUG_PARSE_r({
            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
            regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
            Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
                SvPV_nolen_const(RExC_mysv),
                scan,
                PL_reg_name[exact]);
        });
        if (temp == NULL)
            break;
        scan = REGNODE_OFFSET(temp);
    }
    DEBUG_PARSE_r({
        DEBUG_PARSE_MSG("");
        regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
        Perl_re_printf( aTHX_
                      "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
                      SvPV_nolen_const(RExC_mysv),
                      (IV)val,
                      (IV)(val - scan)
        );
    });

    /* 'scan' is now the last node of the chain; point it at 'val'.  Nodes
     * that keep their offset in the argument field have a full U32 for it;
     * the others are limited to what NEXT_OFF can hold */
    if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
        assert((UV) (val - scan) <= U32_MAX);
        ARG_SET(REGNODE_p(scan), val - scan);
    }
    else {
        if (val - scan > U16_MAX) {
            /* Populate this with something that won't loop and will likely
             * lead to a crash if the caller ignores the failure return, and
             * execution continues */
            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
            return FALSE;
        }
        NEXT_OFF(REGNODE_p(scan)) = val - scan;
    }

    return TRUE; /* Was 'return exact' */
}
20438 #endif
20439
20440 STATIC SV*
20441 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20442
20443     /* Returns an inversion list of all the code points matched by the
20444      * ANYOFM/NANYOFM node 'n' */
20445
20446     SV * cp_list = _new_invlist(-1);
20447     const U8 lowest = (U8) ARG(n);
20448     unsigned int i;
20449     U8 count = 0;
20450     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20451
20452     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20453
20454     /* Starting with the lowest code point, any code point that ANDed with the
20455      * mask yields the lowest code point is in the set */
20456     for (i = lowest; i <= 0xFF; i++) {
20457         if ((i & FLAGS(n)) == ARG(n)) {
20458             cp_list = add_cp_to_invlist(cp_list, i);
20459             count++;
20460
20461             /* We know how many code points (a power of two) that are in the
20462              * set.  No use looking once we've got that number */
20463             if (count >= needed) break;
20464         }
20465     }
20466
20467     if (OP(n) == NANYOFM) {
20468         _invlist_invert(cp_list);
20469     }
20470     return cp_list;
20471 }
20472
20473 /*
20474  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20475  */
20476 #ifdef DEBUGGING
20477
20478 static void
20479 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20480 {
20481     int bit;
20482     int set=0;
20483
20484     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20485
20486     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20487         if (flags & (1<<bit)) {
20488             if (!set++ && lead)
20489                 Perl_re_printf( aTHX_  "%s", lead);
20490             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20491         }
20492     }
20493     if (lead)  {
20494         if (set)
20495             Perl_re_printf( aTHX_  "\n");
20496         else
20497             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20498     }
20499 }
20500
20501 static void
20502 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20503 {
20504     int bit;
20505     int set=0;
20506     regex_charset cs;
20507
20508     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20509
20510     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20511         if (flags & (1<<bit)) {
20512             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20513                 continue;
20514             }
20515             if (!set++ && lead)
20516                 Perl_re_printf( aTHX_  "%s", lead);
20517             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20518         }
20519     }
20520     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20521             if (!set++ && lead) {
20522                 Perl_re_printf( aTHX_  "%s", lead);
20523             }
20524             switch (cs) {
20525                 case REGEX_UNICODE_CHARSET:
20526                     Perl_re_printf( aTHX_  "UNICODE");
20527                     break;
20528                 case REGEX_LOCALE_CHARSET:
20529                     Perl_re_printf( aTHX_  "LOCALE");
20530                     break;
20531                 case REGEX_ASCII_RESTRICTED_CHARSET:
20532                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20533                     break;
20534                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20535                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20536                     break;
20537                 default:
20538                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20539                     break;
20540             }
20541     }
20542     if (lead)  {
20543         if (set)
20544             Perl_re_printf( aTHX_  "\n");
20545         else
20546             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20547     }
20548 }
20549 #endif
20550
20551 void
20552 Perl_regdump(pTHX_ const regexp *r)
20553 {
20554 #ifdef DEBUGGING
20555     int i;
20556     SV * const sv = sv_newmortal();
20557     SV *dsv= sv_newmortal();
20558     RXi_GET_DECL(r, ri);
20559     GET_RE_DEBUG_FLAGS_DECL;
20560
20561     PERL_ARGS_ASSERT_REGDUMP;
20562
20563     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20564
20565     /* Header fields of interest. */
20566     for (i = 0; i < 2; i++) {
20567         if (r->substrs->data[i].substr) {
20568             RE_PV_QUOTED_DECL(s, 0, dsv,
20569                             SvPVX_const(r->substrs->data[i].substr),
20570                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20571                             PL_dump_re_max_len);
20572             Perl_re_printf( aTHX_
20573                           "%s %s%s at %" IVdf "..%" UVuf " ",
20574                           i ? "floating" : "anchored",
20575                           s,
20576                           RE_SV_TAIL(r->substrs->data[i].substr),
20577                           (IV)r->substrs->data[i].min_offset,
20578                           (UV)r->substrs->data[i].max_offset);
20579         }
20580         else if (r->substrs->data[i].utf8_substr) {
20581             RE_PV_QUOTED_DECL(s, 1, dsv,
20582                             SvPVX_const(r->substrs->data[i].utf8_substr),
20583                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20584                             30);
20585             Perl_re_printf( aTHX_
20586                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20587                           i ? "floating" : "anchored",
20588                           s,
20589                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20590                           (IV)r->substrs->data[i].min_offset,
20591                           (UV)r->substrs->data[i].max_offset);
20592         }
20593     }
20594
20595     if (r->check_substr || r->check_utf8)
20596         Perl_re_printf( aTHX_
20597                       (const char *)
20598                       (   r->check_substr == r->substrs->data[1].substr
20599                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20600                        ? "(checking floating" : "(checking anchored"));
20601     if (r->intflags & PREGf_NOSCAN)
20602         Perl_re_printf( aTHX_  " noscan");
20603     if (r->extflags & RXf_CHECK_ALL)
20604         Perl_re_printf( aTHX_  " isall");
20605     if (r->check_substr || r->check_utf8)
20606         Perl_re_printf( aTHX_  ") ");
20607
20608     if (ri->regstclass) {
20609         regprop(r, sv, ri->regstclass, NULL, NULL);
20610         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20611     }
20612     if (r->intflags & PREGf_ANCH) {
20613         Perl_re_printf( aTHX_  "anchored");
20614         if (r->intflags & PREGf_ANCH_MBOL)
20615             Perl_re_printf( aTHX_  "(MBOL)");
20616         if (r->intflags & PREGf_ANCH_SBOL)
20617             Perl_re_printf( aTHX_  "(SBOL)");
20618         if (r->intflags & PREGf_ANCH_GPOS)
20619             Perl_re_printf( aTHX_  "(GPOS)");
20620         Perl_re_printf( aTHX_ " ");
20621     }
20622     if (r->intflags & PREGf_GPOS_SEEN)
20623         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20624     if (r->intflags & PREGf_SKIP)
20625         Perl_re_printf( aTHX_  "plus ");
20626     if (r->intflags & PREGf_IMPLICIT)
20627         Perl_re_printf( aTHX_  "implicit ");
20628     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20629     if (r->extflags & RXf_EVAL_SEEN)
20630         Perl_re_printf( aTHX_  "with eval ");
20631     Perl_re_printf( aTHX_  "\n");
20632     DEBUG_FLAGS_r({
20633         regdump_extflags("r->extflags: ", r->extflags);
20634         regdump_intflags("r->intflags: ", r->intflags);
20635     });
20636 #else
20637     PERL_ARGS_ASSERT_REGDUMP;
20638     PERL_UNUSED_CONTEXT;
20639     PERL_UNUSED_ARG(r);
20640 #endif  /* DEBUGGING */
20641 }
20642
20643 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20644 #ifdef DEBUGGING
20645
20646 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20647      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20648      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20649      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20650      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20651      || _CC_VERTSPACE != 15
20652 #   error Need to adjust order of anyofs[]
20653 #  endif
/* Display names of the POSIX-style character classes.  Each class occupies
 * two consecutive slots: the class itself at the even index and its
 * complement at the following odd index, so the entry for class number 'c' is
 * anyofs[2 * c] and its complement is anyofs[2 * c + 1]. */
static const char * const anyofs[] = {
    /*  0 */ "\\w",         "\\W",
    /*  1 */ "\\d",         "\\D",
    /*  2 */ "[:alpha:]",   "[:^alpha:]",
    /*  3 */ "[:lower:]",   "[:^lower:]",
    /*  4 */ "[:upper:]",   "[:^upper:]",
    /*  5 */ "[:punct:]",   "[:^punct:]",
    /*  6 */ "[:print:]",   "[:^print:]",
    /*  7 */ "[:alnum:]",   "[:^alnum:]",
    /*  8 */ "[:graph:]",   "[:^graph:]",
    /*  9 */ "[:cased:]",   "[:^cased:]",
    /* 10 */ "\\s",         "\\S",
    /* 11 */ "[:blank:]",   "[:^blank:]",
    /* 12 */ "[:xdigit:]",  "[:^xdigit:]",
    /* 13 */ "[:cntrl:]",   "[:^cntrl:]",
    /* 14 */ "[:ascii:]",   "[:^ascii:]",
    /* 15 */ "\\v",         "\\V"
};
20688 #endif
20689
20690 /*
20691 - regprop - printable representation of opcode, with run time support
20692 */
20693
20694 void
20695 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20696 {
20697 #ifdef DEBUGGING
20698     dVAR;
20699     int k;
20700     RXi_GET_DECL(prog, progi);
20701     GET_RE_DEBUG_FLAGS_DECL;
20702
20703     PERL_ARGS_ASSERT_REGPROP;
20704
20705     SvPVCLEAR(sv);
20706
20707     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
20708         if (pRExC_state) {  /* This gives more info, if we have it */
20709             FAIL3("panic: corrupted regexp opcode %d > %d",
20710                   (int)OP(o), (int)REGNODE_MAX);
20711         }
20712         else {
20713             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
20714                              (int)OP(o), (int)REGNODE_MAX);
20715         }
20716     }
20717     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20718
20719     k = PL_regkind[OP(o)];
20720
20721     if (k == EXACT) {
20722         sv_catpvs(sv, " ");
20723         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20724          * is a crude hack but it may be the best for now since
20725          * we have no flag "this EXACTish node was UTF-8"
20726          * --jhi */
20727         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20728                   PL_colors[0], PL_colors[1],
20729                   PERL_PV_ESCAPE_UNI_DETECT |
20730                   PERL_PV_ESCAPE_NONASCII   |
20731                   PERL_PV_PRETTY_ELLIPSES   |
20732                   PERL_PV_PRETTY_LTGT       |
20733                   PERL_PV_PRETTY_NOCLEAR
20734                   );
20735     } else if (k == TRIE) {
20736         /* print the details of the trie in dumpuntil instead, as
20737          * progi->data isn't available here */
20738         const char op = OP(o);
20739         const U32 n = ARG(o);
20740         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20741                (reg_ac_data *)progi->data->data[n] :
20742                NULL;
20743         const reg_trie_data * const trie
20744             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20745
20746         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20747         DEBUG_TRIE_COMPILE_r({
20748           if (trie->jump)
20749             sv_catpvs(sv, "(JUMP)");
20750           Perl_sv_catpvf(aTHX_ sv,
20751             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20752             (UV)trie->startstate,
20753             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20754             (UV)trie->wordcount,
20755             (UV)trie->minlen,
20756             (UV)trie->maxlen,
20757             (UV)TRIE_CHARCOUNT(trie),
20758             (UV)trie->uniquecharcount
20759           );
20760         });
20761         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20762             sv_catpvs(sv, "[");
20763             (void) put_charclass_bitmap_innards(sv,
20764                                                 ((IS_ANYOF_TRIE(op))
20765                                                  ? ANYOF_BITMAP(o)
20766                                                  : TRIE_BITMAP(trie)),
20767                                                 NULL,
20768                                                 NULL,
20769                                                 NULL,
20770                                                 0,
20771                                                 FALSE
20772                                                );
20773             sv_catpvs(sv, "]");
20774         }
20775     } else if (k == CURLY) {
20776         U32 lo = ARG1(o), hi = ARG2(o);
20777         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20778             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20779         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20780         if (hi == REG_INFTY)
20781             sv_catpvs(sv, "INFTY");
20782         else
20783             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20784         sv_catpvs(sv, "}");
20785     }
20786     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
20787         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20788     else if (k == REF || k == OPEN || k == CLOSE
20789              || k == GROUPP || OP(o)==ACCEPT)
20790     {
20791         AV *name_list= NULL;
20792         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20793         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20794         if ( RXp_PAREN_NAMES(prog) ) {
20795             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20796         } else if ( pRExC_state ) {
20797             name_list= RExC_paren_name_list;
20798         }
20799         if (name_list) {
20800             if ( k != REF || (OP(o) < REFN)) {
20801                 SV **name= av_fetch(name_list, parno, 0 );
20802                 if (name)
20803                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20804             }
20805             else {
20806                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20807                 I32 *nums=(I32*)SvPVX(sv_dat);
20808                 SV **name= av_fetch(name_list, nums[0], 0 );
20809                 I32 n;
20810                 if (name) {
20811                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20812                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20813                                     (n ? "," : ""), (IV)nums[n]);
20814                     }
20815                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20816                 }
20817             }
20818         }
20819         if ( k == REF && reginfo) {
20820             U32 n = ARG(o);  /* which paren pair */
20821             I32 ln = prog->offs[n].start;
20822             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20823                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20824             else if (ln == prog->offs[n].end)
20825                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20826             else {
20827                 const char *s = reginfo->strbeg + ln;
20828                 Perl_sv_catpvf(aTHX_ sv, ": ");
20829                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20830                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20831             }
20832         }
20833     } else if (k == GOSUB) {
20834         AV *name_list= NULL;
20835         if ( RXp_PAREN_NAMES(prog) ) {
20836             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20837         } else if ( pRExC_state ) {
20838             name_list= RExC_paren_name_list;
20839         }
20840
20841         /* Paren and offset */
20842         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20843                 (int)((o + (int)ARG2L(o)) - progi->program) );
20844         if (name_list) {
20845             SV **name= av_fetch(name_list, ARG(o), 0 );
20846             if (name)
20847                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20848         }
20849     }
20850     else if (k == LOGICAL)
20851         /* 2: embedded, otherwise 1 */
20852         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20853     else if (k == ANYOF || k == ANYOFR) {
20854         U8 flags;
20855         char * bitmap;
20856         U32 arg;
20857         bool do_sep = FALSE;    /* Do we need to separate various components of
20858                                    the output? */
20859         /* Set if there is still an unresolved user-defined property */
20860         SV *unresolved                = NULL;
20861
20862         /* Things that are ignored except when the runtime locale is UTF-8 */
20863         SV *only_utf8_locale_invlist = NULL;
20864
20865         /* Code points that don't fit in the bitmap */
20866         SV *nonbitmap_invlist = NULL;
20867
20868         /* And things that aren't in the bitmap, but are small enough to be */
20869         SV* bitmap_range_not_in_bitmap = NULL;
20870
20871         bool inverted;
20872
20873         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
20874             flags = 0;
20875             bitmap = NULL;
20876             arg = 0;
20877         }
20878         else {
20879             flags = ANYOF_FLAGS(o);
20880             bitmap = ANYOF_BITMAP(o);
20881             arg = ARG(o);
20882         }
20883
20884         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20885             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20886                 sv_catpvs(sv, "{utf8-locale-reqd}");
20887             }
20888             if (flags & ANYOFL_FOLD) {
20889                 sv_catpvs(sv, "{i}");
20890             }
20891         }
20892
20893         inverted = flags & ANYOF_INVERT;
20894
20895         /* If there is stuff outside the bitmap, get it */
20896         if (arg != ANYOF_ONLY_HAS_BITMAP) {
20897             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
20898                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20899                                             ANYOFRbase(o),
20900                                             ANYOFRbase(o) + ANYOFRdelta(o));
20901             }
20902             else {
20903                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20904                                                 &unresolved,
20905                                                 &only_utf8_locale_invlist,
20906                                                 &nonbitmap_invlist);
20907             }
20908
20909             /* The non-bitmap data may contain stuff that could fit in the
20910              * bitmap.  This could come from a user-defined property being
20911              * finally resolved when this call was done; or much more likely
20912              * because there are matches that require UTF-8 to be valid, and so
20913              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
20914             _invlist_intersection(nonbitmap_invlist,
20915                                   PL_InBitmap,
20916                                   &bitmap_range_not_in_bitmap);
20917             /* Leave just the things that don't fit into the bitmap */
20918             _invlist_subtract(nonbitmap_invlist,
20919                               PL_InBitmap,
20920                               &nonbitmap_invlist);
20921         }
20922
20923         /* Obey this flag to add all above-the-bitmap code points */
20924         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20925             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20926                                                       NUM_ANYOF_CODE_POINTS,
20927                                                       UV_MAX);
20928         }
20929
20930         /* Ready to start outputting.  First, the initial left bracket */
20931         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20932
20933         /* ANYOFH by definition doesn't have anything that will fit inside the
20934          * bitmap;  ANYOFR may or may not. */
20935         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
20936             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
20937                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
20938         {
20939             /* Then all the things that could fit in the bitmap */
20940             do_sep = put_charclass_bitmap_innards(sv,
20941                                                   bitmap,
20942                                                   bitmap_range_not_in_bitmap,
20943                                                   only_utf8_locale_invlist,
20944                                                   o,
20945                                                   flags,
20946
20947                                                   /* Can't try inverting for a
20948                                                    * better display if there
20949                                                    * are things that haven't
20950                                                    * been resolved */
20951                                                   unresolved != NULL
20952                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
20953             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20954
20955             /* If there are user-defined properties which haven't been defined
20956              * yet, output them.  If the result is not to be inverted, it is
20957              * clearest to output them in a separate [] from the bitmap range
20958              * stuff.  If the result is to be complemented, we have to show
20959              * everything in one [], as the inversion applies to the whole
20960              * thing.  Use {braces} to separate them from anything in the
20961              * bitmap and anything above the bitmap. */
20962             if (unresolved) {
20963                 if (inverted) {
20964                     if (! do_sep) { /* If didn't output anything in the bitmap
20965                                      */
20966                         sv_catpvs(sv, "^");
20967                     }
20968                     sv_catpvs(sv, "{");
20969                 }
20970                 else if (do_sep) {
20971                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20972                                                       PL_colors[0]);
20973                 }
20974                 sv_catsv(sv, unresolved);
20975                 if (inverted) {
20976                     sv_catpvs(sv, "}");
20977                 }
20978                 do_sep = ! inverted;
20979             }
20980         }
20981
20982         /* And, finally, add the above-the-bitmap stuff */
20983         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20984             SV* contents;
20985
20986             /* See if truncation size is overridden */
20987             const STRLEN dump_len = (PL_dump_re_max_len > 256)
20988                                     ? PL_dump_re_max_len
20989                                     : 256;
20990
20991             /* This is output in a separate [] */
20992             if (do_sep) {
20993                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20994             }
20995
20996             /* And, for easy of understanding, it is shown in the
20997              * uncomplemented form if possible.  The one exception being if
20998              * there are unresolved items, where the inversion has to be
20999              * delayed until runtime */
21000             if (inverted && ! unresolved) {
21001                 _invlist_invert(nonbitmap_invlist);
21002                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21003             }
21004
21005             contents = invlist_contents(nonbitmap_invlist,
21006                                         FALSE /* output suitable for catsv */
21007                                        );
21008
21009             /* If the output is shorter than the permissible maximum, just do it. */
21010             if (SvCUR(contents) <= dump_len) {
21011                 sv_catsv(sv, contents);
21012             }
21013             else {
21014                 const char * contents_string = SvPVX(contents);
21015                 STRLEN i = dump_len;
21016
21017                 /* Otherwise, start at the permissible max and work back to the
21018                  * first break possibility */
21019                 while (i > 0 && contents_string[i] != ' ') {
21020                     i--;
21021                 }
21022                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21023                                        find a legal break */
21024                     i = dump_len;
21025                 }
21026
21027                 sv_catpvn(sv, contents_string, i);
21028                 sv_catpvs(sv, "...");
21029             }
21030
21031             SvREFCNT_dec_NN(contents);
21032             SvREFCNT_dec_NN(nonbitmap_invlist);
21033         }
21034
21035         /* And finally the matching, closing ']' */
21036         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21037
21038         if (OP(o) == ANYOFHs) {
21039             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21040         }
21041         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21042             U8 lowest = (OP(o) != ANYOFHr)
21043                          ? FLAGS(o)
21044                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21045             U8 highest = (OP(o) == ANYOFHr)
21046                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21047                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21048                            ? 0xFF
21049                            : lowest;
21050             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21051             if (lowest != highest) {
21052                 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21053             }
21054             Perl_sv_catpvf(aTHX_ sv, ")");
21055         }
21056
21057         SvREFCNT_dec(unresolved);
21058     }
21059     else if (k == ANYOFM) {
21060         SV * cp_list = get_ANYOFM_contents(o);
21061
21062         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21063         if (OP(o) == NANYOFM) {
21064             _invlist_invert(cp_list);
21065         }
21066
21067         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21068         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21069
21070         SvREFCNT_dec(cp_list);
21071     }
21072     else if (k == POSIXD || k == NPOSIXD) {
21073         U8 index = FLAGS(o) * 2;
21074         if (index < C_ARRAY_LENGTH(anyofs)) {
21075             if (*anyofs[index] != '[')  {
21076                 sv_catpvs(sv, "[");
21077             }
21078             sv_catpv(sv, anyofs[index]);
21079             if (*anyofs[index] != '[')  {
21080                 sv_catpvs(sv, "]");
21081             }
21082         }
21083         else {
21084             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21085         }
21086     }
21087     else if (k == BOUND || k == NBOUND) {
21088         /* Must be synced with order of 'bound_type' in regcomp.h */
21089         const char * const bounds[] = {
21090             "",      /* Traditional */
21091             "{gcb}",
21092             "{lb}",
21093             "{sb}",
21094             "{wb}"
21095         };
21096         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21097         sv_catpv(sv, bounds[FLAGS(o)]);
21098     }
21099     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21100         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21101         if (o->next_off) {
21102             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21103         }
21104         Perl_sv_catpvf(aTHX_ sv, "]");
21105     }
21106     else if (OP(o) == SBOL)
21107         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21108
21109     /* add on the verb argument if there is one */
21110     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21111         if ( ARG(o) )
21112             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21113                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21114         else
21115             sv_catpvs(sv, ":NULL");
21116     }
21117 #else
21118     PERL_UNUSED_CONTEXT;
21119     PERL_UNUSED_ARG(sv);
21120     PERL_UNUSED_ARG(o);
21121     PERL_UNUSED_ARG(prog);
21122     PERL_UNUSED_ARG(reginfo);
21123     PERL_UNUSED_ARG(pRExC_state);
21124 #endif  /* DEBUGGING */
21125 }
21126
21127
21128
21129 SV *
21130 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21131 {                               /* Assume that RE_INTUIT is set */
21132     struct regexp *const prog = ReANY(r);
21133     GET_RE_DEBUG_FLAGS_DECL;
21134
21135     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21136     PERL_UNUSED_CONTEXT;
21137
21138     DEBUG_COMPILE_r(
21139         {
21140             const char * const s = SvPV_nolen_const(RX_UTF8(r)
21141                       ? prog->check_utf8 : prog->check_substr);
21142
21143             if (!PL_colorset) reginitcolors();
21144             Perl_re_printf( aTHX_
21145                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21146                       PL_colors[4],
21147                       RX_UTF8(r) ? "utf8 " : "",
21148                       PL_colors[5], PL_colors[0],
21149                       s,
21150                       PL_colors[1],
21151                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21152         } );
21153
21154     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21155     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21156 }
21157
21158 /*
21159    pregfree()
21160
21161    handles refcounting and freeing the perl core regexp structure. When
21162    it is necessary to actually free the structure the first thing it
21163    does is call the 'free' method of the regexp_engine associated to
21164    the regexp, allowing the handling of the void *pprivate; member
21165    first. (This routine is not overridable by extensions, which is why
21166    the extensions free is called first.)
21167
21168    See regdupe and regdupe_internal if you change anything here.
21169 */
21170 #ifndef PERL_IN_XSUB_RE
21171 void
21172 Perl_pregfree(pTHX_ REGEXP *r)
21173 {
21174     SvREFCNT_dec(r);
21175 }
21176
21177 void
21178 Perl_pregfree2(pTHX_ REGEXP *rx)
21179 {
21180     struct regexp *const r = ReANY(rx);
21181     GET_RE_DEBUG_FLAGS_DECL;
21182
21183     PERL_ARGS_ASSERT_PREGFREE2;
21184
21185     if (! r)
21186         return;
21187
21188     if (r->mother_re) {
21189         ReREFCNT_dec(r->mother_re);
21190     } else {
21191         CALLREGFREE_PVT(rx); /* free the private data */
21192         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21193     }
21194     if (r->substrs) {
21195         int i;
21196         for (i = 0; i < 2; i++) {
21197             SvREFCNT_dec(r->substrs->data[i].substr);
21198             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21199         }
21200         Safefree(r->substrs);
21201     }
21202     RX_MATCH_COPY_FREE(rx);
21203 #ifdef PERL_ANY_COW
21204     SvREFCNT_dec(r->saved_copy);
21205 #endif
21206     Safefree(r->offs);
21207     SvREFCNT_dec(r->qr_anoncv);
21208     if (r->recurse_locinput)
21209         Safefree(r->recurse_locinput);
21210 }
21211
21212
21213 /*  reg_temp_copy()
21214
21215     Copy ssv to dsv, both of which should be of type SVt_REGEXP or SVt_PVLV,
21216     except that dsv will be created if NULL.
21217
21218     This function is used in two main ways. First to implement
21219         $r = qr/....; $s = $$r;
21220
21221     Secondly, it is used as a hacky workaround to the structural issue of
21222     match results
21223     being stored in the regexp structure which is in turn stored in
21224     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21225     could be PL_curpm in multiple contexts, and could require multiple
21226     result sets being associated with the pattern simultaneously, such
21227     as when doing a recursive match with (??{$qr})
21228
21229     The solution is to make a lightweight copy of the regexp structure
21230     when a qr// is returned from the code executed by (??{$qr}).  This
21231     lightweight copy doesn't actually own any of its data except for
21232     the start/end offsets and the actual regexp structure itself.
21233
21234 */
21235
21236
21237 REGEXP *
21238 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21239 {
21240     struct regexp *drx;
21241     struct regexp *const srx = ReANY(ssv);
21242     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21243
21244     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21245
21246     if (!dsv)
21247         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21248     else {
21249         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21250
21251         /* our only valid caller, sv_setsv_flags(), should have done
21252          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21253         assert(!SvOOK(dsv));
21254         assert(!SvIsCOW(dsv));
21255         assert(!SvROK(dsv));
21256
21257         if (SvPVX_const(dsv)) {
21258             if (SvLEN(dsv))
21259                 Safefree(SvPVX(dsv));
21260             SvPVX(dsv) = NULL;
21261         }
21262         SvLEN_set(dsv, 0);
21263         SvCUR_set(dsv, 0);
21264         SvOK_off((SV *)dsv);
21265
21266         if (islv) {
21267             /* For PVLVs, the head (sv_any) points to an XPVLV, while
21268              * the LV's xpvlenu_rx will point to a regexp body, which
21269              * we allocate here */
21270             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21271             assert(!SvPVX(dsv));
21272             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21273             temp->sv_any = NULL;
21274             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21275             SvREFCNT_dec_NN(temp);
21276             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21277                ing below will not set it. */
21278             SvCUR_set(dsv, SvCUR(ssv));
21279         }
21280     }
21281     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21282        sv_force_normal(sv) is called.  */
21283     SvFAKE_on(dsv);
21284     drx = ReANY(dsv);
21285
21286     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21287     SvPV_set(dsv, RX_WRAPPED(ssv));
21288     /* We share the same string buffer as the original regexp, on which we
21289        hold a reference count, incremented when mother_re is set below.
21290        The string pointer is copied here, being part of the regexp struct.
21291      */
21292     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21293            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21294     if (!islv)
21295         SvLEN_set(dsv, 0);
21296     if (srx->offs) {
21297         const I32 npar = srx->nparens+1;
21298         Newx(drx->offs, npar, regexp_paren_pair);
21299         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21300     }
21301     if (srx->substrs) {
21302         int i;
21303         Newx(drx->substrs, 1, struct reg_substr_data);
21304         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21305
21306         for (i = 0; i < 2; i++) {
21307             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21308             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21309         }
21310
21311         /* check_substr and check_utf8, if non-NULL, point to either their
21312            anchored or float namesakes, and don't hold a second reference.  */
21313     }
21314     RX_MATCH_COPIED_off(dsv);
21315 #ifdef PERL_ANY_COW
21316     drx->saved_copy = NULL;
21317 #endif
21318     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21319     SvREFCNT_inc_void(drx->qr_anoncv);
21320     if (srx->recurse_locinput)
21321         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21322
21323     return dsv;
21324 }
21325 #endif
21326
21327
21328 /* regfree_internal()
21329
21330    Free the private data in a regexp. This is overloadable by
21331    extensions. Perl takes care of the regexp structure in pregfree(),
21332    this covers the *pprivate pointer which technically perl doesn't
21333    know about, however of course we have to handle the
21334    regexp_internal structure when no extension is in use.
21335
21336    Note this is called before freeing anything in the regexp
21337    structure.
21338  */
21339
21340 void
21341 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21342 {
21343     struct regexp *const r = ReANY(rx);
21344     RXi_GET_DECL(r, ri);
21345     GET_RE_DEBUG_FLAGS_DECL;
21346
21347     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21348
21349     if (! ri) {
21350         return;
21351     }
21352
21353     DEBUG_COMPILE_r({
21354         if (!PL_colorset)
21355             reginitcolors();
21356         {
21357             SV *dsv= sv_newmortal();
21358             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21359                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21360             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21361                 PL_colors[4], PL_colors[5], s);
21362         }
21363     });
21364
21365 #ifdef RE_TRACK_PATTERN_OFFSETS
21366     if (ri->u.offsets)
21367         Safefree(ri->u.offsets);             /* 20010421 MJD */
21368 #endif
21369     if (ri->code_blocks)
21370         S_free_codeblocks(aTHX_ ri->code_blocks);
21371
21372     if (ri->data) {
21373         int n = ri->data->count;
21374
21375         while (--n >= 0) {
21376           /* If you add a ->what type here, update the comment in regcomp.h */
21377             switch (ri->data->what[n]) {
21378             case 'a':
21379             case 'r':
21380             case 's':
21381             case 'S':
21382             case 'u':
21383                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21384                 break;
21385             case 'f':
21386                 Safefree(ri->data->data[n]);
21387                 break;
21388             case 'l':
21389             case 'L':
21390                 break;
21391             case 'T':
21392                 { /* Aho Corasick add-on structure for a trie node.
21393                      Used in stclass optimization only */
21394                     U32 refcount;
21395                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21396 #ifdef USE_ITHREADS
21397                     dVAR;
21398 #endif
21399                     OP_REFCNT_LOCK;
21400                     refcount = --aho->refcount;
21401                     OP_REFCNT_UNLOCK;
21402                     if ( !refcount ) {
21403                         PerlMemShared_free(aho->states);
21404                         PerlMemShared_free(aho->fail);
21405                          /* do this last!!!! */
21406                         PerlMemShared_free(ri->data->data[n]);
21407                         /* we should only ever get called once, so
21408                          * assert as much, and also guard the free
21409                          * which /might/ happen twice. At the least
21410                          * it will make code anlyzers happy and it
21411                          * doesn't cost much. - Yves */
21412                         assert(ri->regstclass);
21413                         if (ri->regstclass) {
21414                             PerlMemShared_free(ri->regstclass);
21415                             ri->regstclass = 0;
21416                         }
21417                     }
21418                 }
21419                 break;
21420             case 't':
21421                 {
21422                     /* trie structure. */
21423                     U32 refcount;
21424                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21425 #ifdef USE_ITHREADS
21426                     dVAR;
21427 #endif
21428                     OP_REFCNT_LOCK;
21429                     refcount = --trie->refcount;
21430                     OP_REFCNT_UNLOCK;
21431                     if ( !refcount ) {
21432                         PerlMemShared_free(trie->charmap);
21433                         PerlMemShared_free(trie->states);
21434                         PerlMemShared_free(trie->trans);
21435                         if (trie->bitmap)
21436                             PerlMemShared_free(trie->bitmap);
21437                         if (trie->jump)
21438                             PerlMemShared_free(trie->jump);
21439                         PerlMemShared_free(trie->wordinfo);
21440                         /* do this last!!!! */
21441                         PerlMemShared_free(ri->data->data[n]);
21442                     }
21443                 }
21444                 break;
21445             default:
21446                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21447                                                     ri->data->what[n]);
21448             }
21449         }
21450         Safefree(ri->data->what);
21451         Safefree(ri->data);
21452     }
21453
21454     Safefree(ri);
21455 }
21456
/* Typed wrappers around sv_dup_inc() for AVs and HVs.  The macro arguments
 * are parenthesized so that the cast applies to the whole of an expression
 * argument rather than just to its first operand. */
#define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)(s), (t)))
#define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)(s), (t)))

/* savepvn() that yields NULL for a NULL 'p'.  Note that 'p' is evaluated
 * twice, so it must not have side effects. */
#define SAVEPVN(p, n)   ((p) ? savepvn((p), (n)) : NULL)
21460
21461 /*
21462    re_dup_guts - duplicate a regexp.
21463
21464    This routine is expected to clone a given regexp structure. It is only
21465    compiled under USE_ITHREADS.
21466
21467    After all of the core data stored in struct regexp is duplicated
21468    the regexp_engine.dupe method is used to copy any private data
21469    stored in the *pprivate pointer. This allows extensions to handle
21470    any duplication it needs to do.
21471
21472    See pregfree() and regfree_internal() if you change anything here.
21473 */
21474 #if defined(USE_ITHREADS)
21475 #ifndef PERL_IN_XSUB_RE
21476 void
21477 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21478 {
21479     dVAR;
21480     I32 npar;
21481     const struct regexp *r = ReANY(sstr);
21482     struct regexp *ret = ReANY(dstr);
21483
21484     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21485
21486     npar = r->nparens+1;
21487     Newx(ret->offs, npar, regexp_paren_pair);
21488     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21489
21490     if (ret->substrs) {
21491         /* Do it this way to avoid reading from *r after the StructCopy().
21492            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21493            cache, it doesn't matter.  */
21494         int i;
21495         const bool anchored = r->check_substr
21496             ? r->check_substr == r->substrs->data[0].substr
21497             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21498         Newx(ret->substrs, 1, struct reg_substr_data);
21499         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21500
21501         for (i = 0; i < 2; i++) {
21502             ret->substrs->data[i].substr =
21503                         sv_dup_inc(ret->substrs->data[i].substr, param);
21504             ret->substrs->data[i].utf8_substr =
21505                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21506         }
21507
21508         /* check_substr and check_utf8, if non-NULL, point to either their
21509            anchored or float namesakes, and don't hold a second reference.  */
21510
21511         if (ret->check_substr) {
21512             if (anchored) {
21513                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21514
21515                 ret->check_substr = ret->substrs->data[0].substr;
21516                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21517             } else {
21518                 assert(r->check_substr == r->substrs->data[1].substr);
21519                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21520
21521                 ret->check_substr = ret->substrs->data[1].substr;
21522                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21523             }
21524         } else if (ret->check_utf8) {
21525             if (anchored) {
21526                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21527             } else {
21528                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21529             }
21530         }
21531     }
21532
21533     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21534     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21535     if (r->recurse_locinput)
21536         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21537
21538     if (ret->pprivate)
21539         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21540
21541     if (RX_MATCH_COPIED(dstr))
21542         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21543     else
21544         ret->subbeg = NULL;
21545 #ifdef PERL_ANY_COW
21546     ret->saved_copy = NULL;
21547 #endif
21548
21549     /* Whether mother_re be set or no, we need to copy the string.  We
21550        cannot refrain from copying it when the storage points directly to
21551        our mother regexp, because that's
21552                1: a buffer in a different thread
21553                2: something we no longer hold a reference on
21554                so we need to copy it locally.  */
21555     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21556     /* set malloced length to a non-zero value so it will be freed
21557      * (otherwise in combination with SVf_FAKE it looks like an alien
21558      * buffer). It doesn't have to be the actual malloced size, since it
21559      * should never be grown */
21560     SvLEN_set(dstr, SvCUR(sstr)+1);
21561     ret->mother_re   = NULL;
21562 }
21563 #endif /* PERL_IN_XSUB_RE */
21564
21565 /*
21566    regdupe_internal()
21567
21568    This is the internal complement to regdupe() which is used to copy
21569    the structure pointed to by the *pprivate pointer in the regexp.
21570    This is the core version of the extension overridable cloning hook.
21571    The regexp structure being duplicated will be copied by perl prior
21572    to this and will be provided as the regexp *r argument, however
21573    with the /old/ structures pprivate pointer value. Thus this routine
21574    may override any copying normally done by perl.
21575
21576    It returns a pointer to the new regexp_internal structure.
21577 */
21578
21579 void *
21580 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21581 {
21582     dVAR;
21583     struct regexp *const r = ReANY(rx);
21584     regexp_internal *reti;
21585     int len;
21586     RXi_GET_DECL(r, ri);
21587
21588     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21589
21590     len = ProgLen(ri);
21591
21592     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21593           char, regexp_internal);
21594     Copy(ri->program, reti->program, len+1, regnode);
21595
21596
21597     if (ri->code_blocks) {
21598         int n;
21599         Newx(reti->code_blocks, 1, struct reg_code_blocks);
21600         Newx(reti->code_blocks->cb, ri->code_blocks->count,
21601                     struct reg_code_block);
21602         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21603              ri->code_blocks->count, struct reg_code_block);
21604         for (n = 0; n < ri->code_blocks->count; n++)
21605              reti->code_blocks->cb[n].src_regex = (REGEXP*)
21606                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21607         reti->code_blocks->count = ri->code_blocks->count;
21608         reti->code_blocks->refcnt = 1;
21609     }
21610     else
21611         reti->code_blocks = NULL;
21612
21613     reti->regstclass = NULL;
21614
21615     if (ri->data) {
21616         struct reg_data *d;
21617         const int count = ri->data->count;
21618         int i;
21619
21620         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21621                 char, struct reg_data);
21622         Newx(d->what, count, U8);
21623
21624         d->count = count;
21625         for (i = 0; i < count; i++) {
21626             d->what[i] = ri->data->what[i];
21627             switch (d->what[i]) {
21628                 /* see also regcomp.h and regfree_internal() */
21629             case 'a': /* actually an AV, but the dup function is identical.
21630                          values seem to be "plain sv's" generally. */
21631             case 'r': /* a compiled regex (but still just another SV) */
21632             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21633                          this use case should go away, the code could have used
21634                          'a' instead - see S_set_ANYOF_arg() for array contents. */
21635             case 'S': /* actually an SV, but the dup function is identical.  */
21636             case 'u': /* actually an HV, but the dup function is identical.
21637                          values are "plain sv's" */
21638                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21639                 break;
21640             case 'f':
21641                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21642                  * patterns which could start with several different things. Pre-TRIE
21643                  * this was more important than it is now, however this still helps
21644                  * in some places, for instance /x?a+/ might produce a SSC equivalent
21645                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21646                  * in regexec.c
21647                  */
21648                 /* This is cheating. */
21649                 Newx(d->data[i], 1, regnode_ssc);
21650                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21651                 reti->regstclass = (regnode*)d->data[i];
21652                 break;
21653             case 'T':
21654                 /* AHO-CORASICK fail table */
21655                 /* Trie stclasses are readonly and can thus be shared
21656                  * without duplication. We free the stclass in pregfree
21657                  * when the corresponding reg_ac_data struct is freed.
21658                  */
21659                 reti->regstclass= ri->regstclass;
21660                 /* FALLTHROUGH */
21661             case 't':
21662                 /* TRIE transition table */
21663                 OP_REFCNT_LOCK;
21664                 ((reg_trie_data*)ri->data->data[i])->refcount++;
21665                 OP_REFCNT_UNLOCK;
21666                 /* FALLTHROUGH */
21667             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21668             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21669                          is not from another regexp */
21670                 d->data[i] = ri->data->data[i];
21671                 break;
21672             default:
21673                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21674                                                            ri->data->what[i]);
21675             }
21676         }
21677
21678         reti->data = d;
21679     }
21680     else
21681         reti->data = NULL;
21682
21683     reti->name_list_idx = ri->name_list_idx;
21684
21685 #ifdef RE_TRACK_PATTERN_OFFSETS
21686     if (ri->u.offsets) {
21687         Newx(reti->u.offsets, 2*len+1, U32);
21688         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21689     }
21690 #else
21691     SetProgLen(reti, len);
21692 #endif
21693
21694     return (void*)reti;
21695 }
21696
21697 #endif    /* USE_ITHREADS */
21698
21699 #ifndef PERL_IN_XSUB_RE
21700
21701 /*
21702  - regnext - dig the "next" pointer out of a node
21703  */
21704 regnode *
21705 Perl_regnext(pTHX_ regnode *p)
21706 {
21707     I32 offset;
21708
21709     if (!p)
21710         return(NULL);
21711
21712     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
21713         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21714                                                 (int)OP(p), (int)REGNODE_MAX);
21715     }
21716
21717     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21718     if (offset == 0)
21719         return(NULL);
21720
21721     return(p+offset);
21722 }
21723
21724 #endif
21725
21726 STATIC void
21727 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21728 {
21729     va_list args;
21730     STRLEN l1 = strlen(pat1);
21731     STRLEN l2 = strlen(pat2);
21732     char buf[512];
21733     SV *msv;
21734     const char *message;
21735
21736     PERL_ARGS_ASSERT_RE_CROAK2;
21737
21738     if (l1 > 510)
21739         l1 = 510;
21740     if (l1 + l2 > 510)
21741         l2 = 510 - l1;
21742     Copy(pat1, buf, l1 , char);
21743     Copy(pat2, buf + l1, l2 , char);
21744     buf[l1 + l2] = '\n';
21745     buf[l1 + l2 + 1] = '\0';
21746     va_start(args, pat2);
21747     msv = vmess(buf, &args);
21748     va_end(args);
21749     message = SvPV_const(msv, l1);
21750     if (l1 > 512)
21751         l1 = 512;
21752     Copy(message, buf, l1 , char);
21753     /* l1-1 to avoid \n */
21754     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21755 }
21756
21757 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21758
21759 #ifndef PERL_IN_XSUB_RE
21760 void
21761 Perl_save_re_context(pTHX)
21762 {
21763     I32 nparens = -1;
21764     I32 i;
21765
21766     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21767
21768     if (PL_curpm) {
21769         const REGEXP * const rx = PM_GETRE(PL_curpm);
21770         if (rx)
21771             nparens = RX_NPARENS(rx);
21772     }
21773
21774     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21775      * that PL_curpm will be null, but that utf8.pm and the modules it
21776      * loads will only use $1..$3.
21777      * The t/porting/re_context.t test file checks this assumption.
21778      */
21779     if (nparens == -1)
21780         nparens = 3;
21781
21782     for (i = 1; i <= nparens; i++) {
21783         char digits[TYPE_CHARS(long)];
21784         const STRLEN len = my_snprintf(digits, sizeof(digits),
21785                                        "%lu", (long)i);
21786         GV *const *const gvp
21787             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21788
21789         if (gvp) {
21790             GV * const gv = *gvp;
21791             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21792                 save_scalar(gv);
21793         }
21794     }
21795 }
21796 #endif
21797
21798 #ifdef DEBUGGING
21799
21800 STATIC void
21801 S_put_code_point(pTHX_ SV *sv, UV c)
21802 {
21803     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21804
21805     if (c > 255) {
21806         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21807     }
21808     else if (isPRINT(c)) {
21809         const char string = (char) c;
21810
21811         /* We use {phrase} as metanotation in the class, so also escape literal
21812          * braces */
21813         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21814             sv_catpvs(sv, "\\");
21815         sv_catpvn(sv, &string, 1);
21816     }
21817     else if (isMNEMONIC_CNTRL(c)) {
21818         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21819     }
21820     else {
21821         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21822     }
21823 }
21824
21825 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21826
21827 STATIC void
21828 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21829 {
21830     /* Appends to 'sv' a displayable version of the range of code points from
21831      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21832      * that have them, when they occur at the beginning or end of the range.
21833      * It uses hex to output the remaining code points, unless 'allow_literals'
21834      * is true, in which case the printable ASCII ones are output as-is (though
21835      * some of these will be escaped by put_code_point()).
21836      *
21837      * NOTE:  This is designed only for printing ranges of code points that fit
21838      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21839      */
21840
21841     const unsigned int min_range_count = 3;
21842
21843     assert(start <= end);
21844
21845     PERL_ARGS_ASSERT_PUT_RANGE;
21846
21847     while (start <= end) {
21848         UV this_end;
21849         const char * format;
21850
21851         if (end - start < min_range_count) {
21852
21853             /* Output chars individually when they occur in short ranges */
21854             for (; start <= end; start++) {
21855                 put_code_point(sv, start);
21856             }
21857             break;
21858         }
21859
21860         /* If permitted by the input options, and there is a possibility that
21861          * this range contains a printable literal, look to see if there is
21862          * one. */
21863         if (allow_literals && start <= MAX_PRINT_A) {
21864
21865             /* If the character at the beginning of the range isn't an ASCII
21866              * printable, effectively split the range into two parts:
21867              *  1) the portion before the first such printable,
21868              *  2) the rest
21869              * and output them separately. */
21870             if (! isPRINT_A(start)) {
21871                 UV temp_end = start + 1;
21872
21873                 /* There is no point looking beyond the final possible
21874                  * printable, in MAX_PRINT_A */
21875                 UV max = MIN(end, MAX_PRINT_A);
21876
21877                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21878                     temp_end++;
21879                 }
21880
21881                 /* Here, temp_end points to one beyond the first printable if
21882                  * found, or to one beyond 'max' if not.  If none found, make
21883                  * sure that we use the entire range */
21884                 if (temp_end > MAX_PRINT_A) {
21885                     temp_end = end + 1;
21886                 }
21887
21888                 /* Output the first part of the split range: the part that
21889                  * doesn't have printables, with the parameter set to not look
21890                  * for literals (otherwise we would infinitely recurse) */
21891                 put_range(sv, start, temp_end - 1, FALSE);
21892
21893                 /* The 2nd part of the range (if any) starts here. */
21894                 start = temp_end;
21895
21896                 /* We do a continue, instead of dropping down, because even if
21897                  * the 2nd part is non-empty, it could be so short that we want
21898                  * to output it as individual characters, as tested for at the
21899                  * top of this loop.  */
21900                 continue;
21901             }
21902
21903             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21904              * output a sub-range of just the digits or letters, then process
21905              * the remaining portion as usual. */
21906             if (isALPHANUMERIC_A(start)) {
21907                 UV mask = (isDIGIT_A(start))
21908                            ? _CC_DIGIT
21909                              : isUPPER_A(start)
21910                                ? _CC_UPPER
21911                                : _CC_LOWER;
21912                 UV temp_end = start + 1;
21913
21914                 /* Find the end of the sub-range that includes just the
21915                  * characters in the same class as the first character in it */
21916                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21917                     temp_end++;
21918                 }
21919                 temp_end--;
21920
21921                 /* For short ranges, don't duplicate the code above to output
21922                  * them; just call recursively */
21923                 if (temp_end - start < min_range_count) {
21924                     put_range(sv, start, temp_end, FALSE);
21925                 }
21926                 else {  /* Output as a range */
21927                     put_code_point(sv, start);
21928                     sv_catpvs(sv, "-");
21929                     put_code_point(sv, temp_end);
21930                 }
21931                 start = temp_end + 1;
21932                 continue;
21933             }
21934
21935             /* We output any other printables as individual characters */
21936             if (isPUNCT_A(start) || isSPACE_A(start)) {
21937                 while (start <= end && (isPUNCT_A(start)
21938                                         || isSPACE_A(start)))
21939                 {
21940                     put_code_point(sv, start);
21941                     start++;
21942                 }
21943                 continue;
21944             }
21945         } /* End of looking for literals */
21946
21947         /* Here is not to output as a literal.  Some control characters have
21948          * mnemonic names.  Split off any of those at the beginning and end of
21949          * the range to print mnemonically.  It isn't possible for many of
21950          * these to be in a row, so this won't overwhelm with output */
21951         if (   start <= end
21952             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21953         {
21954             while (isMNEMONIC_CNTRL(start) && start <= end) {
21955                 put_code_point(sv, start);
21956                 start++;
21957             }
21958
21959             /* If this didn't take care of the whole range ... */
21960             if (start <= end) {
21961
21962                 /* Look backwards from the end to find the final non-mnemonic
21963                  * */
21964                 UV temp_end = end;
21965                 while (isMNEMONIC_CNTRL(temp_end)) {
21966                     temp_end--;
21967                 }
21968
21969                 /* And separately output the interior range that doesn't start
21970                  * or end with mnemonics */
21971                 put_range(sv, start, temp_end, FALSE);
21972
21973                 /* Then output the mnemonic trailing controls */
21974                 start = temp_end + 1;
21975                 while (start <= end) {
21976                     put_code_point(sv, start);
21977                     start++;
21978                 }
21979                 break;
21980             }
21981         }
21982
21983         /* As a final resort, output the range or subrange as hex. */
21984
21985         if (start >= NUM_ANYOF_CODE_POINTS) {
21986             this_end = end;
21987         }
21988         else {  /* Have to split range at the bitmap boundary */
21989             this_end = (end < NUM_ANYOF_CODE_POINTS)
21990                         ? end
21991                         : NUM_ANYOF_CODE_POINTS - 1;
21992         }
21993 #if NUM_ANYOF_CODE_POINTS > 256
21994         format = (this_end < 256)
21995                  ? "\\x%02" UVXf "-\\x%02" UVXf
21996                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21997 #else
21998         format = "\\x%02" UVXf "-\\x%02" UVXf;
21999 #endif
22000         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22001         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22002         GCC_DIAG_RESTORE_STMT;
22003         break;
22004     }
22005 }
22006
22007 STATIC void
22008 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22009 {
22010     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22011      * 'invlist'.
           *
           * This makes two passes over 'invlist'.  The first decides whether
           * printable characters may be shown as literals ('allow_literals'); the
           * second hands each range to put_range() along with that decision.
           * Ranges that begin at or above NUM_ANYOF_CODE_POINTS are not output
           * here. */
22012
22013     UV start, end;
22014     bool allow_literals = TRUE;
22015
22016     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22017
22018     /* Generally, it is more readable if printable characters are output as
22019      * literals, but if a range (nearly) spans all of them, it's best to output
22020      * it as a single range.  This code will use a single range if all but 2
22021      * ASCII printables are in it */
22022     invlist_iterinit(invlist);
22023     while (invlist_iternext(invlist, &start, &end)) {
22024
22025         /* If the range starts beyond the final printable, it doesn't have any
22026          * in it */
22027         if (start > MAX_PRINT_A) {
22028             break;
22029         }
22030
22031         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22032          * all but two, the range must start and end no later than 2 from
22033          * either end */
22034         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {

                  /* Clip the range to the printables so that its length can be
                   * compared with the number of printables.  Only this loop's
                   * copies in 'start' and 'end' are changed; 'invlist' is not
                   * modified */
22035             if (end > MAX_PRINT_A) {
22036                 end = MAX_PRINT_A;
22037             }
22038             if (start < ' ') {
22039                 start = ' ';
22040             }
22041             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22042                 allow_literals = FALSE;
22043             }

                  /* This was the one candidate range, so stop looking.
                   * (Presumably the ranges of an inversion list are disjoint and
                   * ascending, so no later one could also qualify.) */
22044             break;
22045         }
22046     }
22047     invlist_iterfinish(invlist);
22048
22049     /* Here we have figured things out.  Output each range */
22050     invlist_iterinit(invlist);
22051     while (invlist_iternext(invlist, &start, &end)) {

              /* Stop at the first range that lies wholly above the bitmap; a
               * range that merely extends past it is still passed on in full */
22052         if (start >= NUM_ANYOF_CODE_POINTS) {
22053             break;
22054         }
22055         put_range(sv, start, end, allow_literals);
22056     }
22057     invlist_iterfinish(invlist);
22058
22059     return;
22060 }
22061
22062 STATIC SV*
22063 S_put_charclass_bitmap_innards_common(pTHX_
22064         SV* invlist,            /* The bitmap */
22065         SV* posixes,            /* Under /l, things like [:word:], \S */
22066         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22067         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22068         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22069         const bool invert       /* Is the result to be inverted? */
22070 )
22071 {
22072     /* Create and return an SV containing a displayable version of the bitmap
22073      * and associated information determined by the input parameters.  If the
22074      * output would have been only the inversion indicator '^', NULL is instead
22075      * returned.
           *
           * The returned SV is newly created here and the caller is responsible
           * for freeing it.  None of the input lists is modified.  'posixes',
           * 'only_utf8', 'not_utf8' and 'only_utf8_locale' may each be NULL, in
           * which case that part of the display is simply omitted. */
22076
22077     dVAR;
22078     SV * output;
22079
22080     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22081
22082     if (invert) {
22083         output = newSVpvs("^");
22084     }
22085     else {
22086         output = newSVpvs("");
22087     }
22088
22089     /* First, the code points in the bitmap that are unconditionally there */
22090     put_charclass_bitmap_innards_invlist(output, invlist);
22091
22092     /* Traditionally, these have been placed after the main code points */
22093     if (posixes) {
22094         sv_catsv(output, posixes);
22095     }
22096
          /* Each conditional list is introduced by a tag naming its condition,
           * with PL_colors[1] before the tag and PL_colors[0] after it */
22097     if (only_utf8 && _invlist_len(only_utf8)) {
22098         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22099         put_charclass_bitmap_innards_invlist(output, only_utf8);
22100     }
22101
22102     if (not_utf8 && _invlist_len(not_utf8)) {
22103         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22104         put_charclass_bitmap_innards_invlist(output, not_utf8);
22105     }
22106
22107     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22108         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22109         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22110
22111         /* This is the only list in this routine that can legally contain code
22112          * points outside the bitmap range.  The call just above to
22113          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22114          * output them here.  There's about a half-dozen possible, and none in
22115          * contiguous ranges longer than 2 */
22116         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22117             UV start, end;
22118             SV* above_bitmap = NULL;
22119
22120             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22121
22122             invlist_iterinit(above_bitmap);
22123             while (invlist_iternext(above_bitmap, &start, &end)) {
22124                 UV i;
22125
22126                 for (i = start; i <= end; i++) {
22127                     put_code_point(output, i);
22128                 }
22129             }
22130             invlist_iterfinish(above_bitmap);
22131             SvREFCNT_dec_NN(above_bitmap);
22132         }
22133     }
22134
          /* Nothing was appended after the '^'.  The caller never sees 'output'
           * in this case, so it has to be released here or it would leak */
22135     if (invert && SvCUR(output) == 1) {
              SvREFCNT_dec_NN(output);
22136         return NULL;
22137     }
22138
22139     return output;
22140 }
22141
22142 STATIC bool
22143 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22144                                      char *bitmap,
22145                                      SV *nonbitmap_invlist,
22146                                      SV *only_utf8_locale_invlist,
22147                                      const regnode * const node,
22148                                      const U8 flags,
22149                                      const bool force_as_is_display)
22150 {
22151     /* Appends to 'sv' a displayable version of the innards of the bracketed
22152      * character class defined by the other arguments:
22153      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22154      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22155      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22156      *      none.  The reasons for this could be that they require some
22157      *      condition such as the target string being or not being in UTF-8
22158      *      (under /d), or because they came from a user-defined property that
22159      *      was not resolved at the time of the regex compilation (under /u)
22160      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22161      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22162      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22163      *      above two parameters are not null, and is passed so that this
22164      *      routine can tease apart the various reasons for them.
22165      *  'flags' is the flags field of 'node'
22166      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22167      *      to invert things to see if that leads to a cleaner display.  If
22168      *      FALSE, this routine is free to use its judgment about doing this.
22169      *
22170      * It returns TRUE if there was actually something output.  (It may be that
22171      * the bitmap, etc is empty.)
22172      *
22173      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22174      * bitmap, with the succeeding parameters set to NULL, and the final one to
22175      * FALSE.
           *
           * The two lists passed in are cloned before being changed, and every
           * SV this routine creates (the working lists and the candidate display
           * strings) is released before it returns.
22176      */
22177
22178     /* In general, it tries to display the 'cleanest' representation of the
22179      * innards, choosing whether to display them inverted or not, regardless of
22180      * whether the class itself is to be inverted.  However,  there are some
22181      * cases where it can't try inverting, as what actually matches isn't known
22182      * until runtime, and hence the inversion isn't either. */
22183
22184     dVAR;
22185     bool inverting_allowed = ! force_as_is_display;
22186
22187     int i;
22188     STRLEN orig_sv_cur = SvCUR(sv);
22189
22190     SV* invlist;            /* Inversion list we accumulate of code points that
22191                                are unconditionally matched */
22192     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22193                                UTF-8 */
22194     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22195                              */
22196     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22197     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22198                                        is UTF-8 */
22199
22200     SV* as_is_display;      /* The output string when we take the inputs
22201                                literally */
22202     SV* inverted_display;   /* The output string when we invert the inputs */
22203
22204     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22205                                                    to match? */
22206     /* We are biased in favor of displaying things without them being inverted,
22207      * as that is generally easier to understand */
22208     const int bias = 5;
22209
22210     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22211
22212     /* Start off with whatever code points are passed in.  (We clone, so we
22213      * don't change the caller's list) */
22214     if (nonbitmap_invlist) {
22215         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22216         invlist = invlist_clone(nonbitmap_invlist, NULL);
22217     }
22218     else {  /* Worst case size is every other code point is matched */
22219         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22220     }
22221
          /* 'node' is examined only inside this block, that is, only when 'flags'
           * is non-zero */
22222     if (flags) {
22223         if (OP(node) == ANYOFD) {
22224
22225             /* This flag indicates that the code points below 0x100 in the
22226              * nonbitmap list are precisely the ones that match only when the
22227              * target is UTF-8 (they should all be non-ASCII). */
22228             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22229             {
                      /* Move those code points out of the unconditional list and
                       * into 'only_utf8' */
22230                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22231                 _invlist_subtract(invlist, only_utf8, &invlist);
22232             }
22233
22234             /* And this flag for matching all non-ASCII 0xFF and below */
22235             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22236             {
22237                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22238             }
22239         }
22240         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22241
22242             /* If either of these flags are set, what matches isn't
22243              * determinable except during execution, so don't know enough here
22244              * to invert */
22245             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22246                 inverting_allowed = FALSE;
22247             }
22248
22249             /* What the posix classes match also varies at runtime, so these
22250              * will be output symbolically. */
22251             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {

                      /* NOTE(review): this 'i' shadows the function-level 'i'.
                       * That is harmless here, as the outer one is not used
                       * until the bitmap loop below */
22252                 int i;
22253
22254                 posixes = newSVpvs("");
22255                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22256                     if (ANYOF_POSIXL_TEST(node, i)) {
22257                         sv_catpv(posixes, anyofs[i]);
22258                     }
22259                 }
22260             }
22261         }
22262     }
22263
22264     /* Accumulate the bit map into the unconditional match list */
22265     if (bitmap) {
22266         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22267             if (BITMAP_TEST(bitmap, i)) {

                      /* 'start' is the first set bit of a run.  The inner loop
                       * leaves 'i' one past the end of the run, so the whole run
                       * is added as a single range */
22268                 int start = i++;
22269                 for (;
22270                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22271                      i++)
22272                 { /* empty */ }
22273                 invlist = _add_range_to_invlist(invlist, start, i-1);
22274             }
22275         }
22276     }
22277
22278     /* Make sure that the conditional match lists don't have anything in them
22279      * that match unconditionally; otherwise the output is quite confusing.
22280      * This could happen if the code that populates these misses some
22281      * duplication. */
22282     if (only_utf8) {
22283         _invlist_subtract(only_utf8, invlist, &only_utf8);
22284     }
22285     if (not_utf8) {
22286         _invlist_subtract(not_utf8, invlist, &not_utf8);
22287     }
22288
22289     if (only_utf8_locale_invlist) {
22290
22291         /* Since this list is passed in, we have to make a copy before
22292          * modifying it */
22293         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22294
22295         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22296
22297         /* And, it can get really weird for us to try outputting an inverted
22298          * form of this list when it has things above the bitmap, so don't even
22299          * try */
22300         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22301             inverting_allowed = FALSE;
22302         }
22303     }
22304
22305     /* Calculate what the output would be if we take the input as-is */
22306     as_is_display = put_charclass_bitmap_innards_common(invlist,
22307                                                     posixes,
22308                                                     only_utf8,
22309                                                     not_utf8,
22310                                                     only_utf8_locale,
22311                                                     invert);
22312
22313     /* If have to take the output as-is, just do that */
22314     if (! inverting_allowed) {

              /* 'as_is_display' is NULL when the display would have been just
               * the '^' */
22315         if (as_is_display) {
22316             sv_catsv(sv, as_is_display);
22317             SvREFCNT_dec_NN(as_is_display);
22318         }
22319     }
22320     else { /* But otherwise, create the output again on the inverted input, and
22321               use whichever version is shorter */
22322
22323         int inverted_bias, as_is_bias;
22324
22325         /* We will apply our bias to whichever of the results doesn't have
22326          * the '^' */
22327         if (invert) {
22328             invert = FALSE;
22329             as_is_bias = bias;
22330             inverted_bias = 0;
22331         }
22332         else {
22333             invert = TRUE;
22334             as_is_bias = 0;
22335             inverted_bias = bias;
22336         }
22337
22338         /* Now invert each of the lists that contribute to the output,
22339          * excluding from the result things outside the possible range */
22340
22341         /* For the unconditional inversion list, we have to add in all the
22342          * conditional code points, so that when inverted, they will be gone
22343          * from it */
22344         _invlist_union(only_utf8, invlist, &invlist);
22345         _invlist_union(not_utf8, invlist, &invlist);
22346         _invlist_union(only_utf8_locale, invlist, &invlist);
22347         _invlist_invert(invlist);
22348         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22349
22350         if (only_utf8) {
22351             _invlist_invert(only_utf8);
22352             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22353         }
22354         else if (not_utf8) {
22355
22356             /* If a code point matches iff the target string is not in UTF-8,
22357              * then complementing the result has it not match iff not in UTF-8,
22358              * which is the same thing as matching iff it is UTF-8. */
22359             only_utf8 = not_utf8;
22360             not_utf8 = NULL;
22361         }
22362
22363         if (only_utf8_locale) {
22364             _invlist_invert(only_utf8_locale);
22365             _invlist_intersection(only_utf8_locale,
22366                                   PL_InBitmap,
22367                                   &only_utf8_locale);
22368         }
22369
22370         inverted_display = put_charclass_bitmap_innards_common(
22371                                             invlist,
22372                                             posixes,
22373                                             only_utf8,
22374                                             not_utf8,
22375                                             only_utf8_locale, invert);
22376
22377         /* Use the shortest representation, taking into account our bias
22378          * against showing it inverted */
22379         if (   inverted_display
22380             && (   ! as_is_display
22381                 || (  SvCUR(inverted_display) + inverted_bias
22382                     < SvCUR(as_is_display)    + as_is_bias)))
22383         {
22384             sv_catsv(sv, inverted_display);
22385         }
22386         else if (as_is_display) {
22387             sv_catsv(sv, as_is_display);
22388         }
22389
              /* Either of these may be NULL, hence not the _NN form */
22390         SvREFCNT_dec(as_is_display);
22391         SvREFCNT_dec(inverted_display);
22392     }
22393
          /* 'invlist' always exists by now; the other working lists exist only if
           * the corresponding condition arose */
22394     SvREFCNT_dec_NN(invlist);
22395     SvREFCNT_dec(only_utf8);
22396     SvREFCNT_dec(not_utf8);
22397     SvREFCNT_dec(posixes);
22398     SvREFCNT_dec(only_utf8_locale);
22399
          /* TRUE iff 'sv' is longer than it was on entry */
22400     return SvCUR(sv) > orig_sv_cur;
22401 }
22402
22403 #define CLEAR_OPTSTART                                                       \
22404     if (optstart) STMT_START {                                               \
22405         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22406                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22407         optstart=NULL;                                                       \
22408     } STMT_END
22409
22410 #define DUMPUNTIL(b,e)                                                       \
22411                     CLEAR_OPTSTART;                                          \
22412                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22413
22414 STATIC const regnode *
22415 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22416             const regnode *last, const regnode *plast,
22417             SV* sv, I32 indent, U32 depth)
22418 {
          /* Print a description of the regnodes of 'r' beginning at 'node', and
           * return the node at which dumping stopped.
           *
           *  'start'  is the node that the printed offsets are relative to
           *  'node'   is the first node to print
           *  'last'   is where to stop; NULL means there is no such limit
           *  'plast'  when non-NULL and lower than 'last', is used in its place
           *  'sv'     is a scratch SV; regprop() is called on it for each node
           *           and its string is what gets printed
           *  'indent' is the nesting level used to indent the output
           *  'depth'  is passed on, incremented, to the recursive calls
           *
           * The loop also ends once a node whose regkind is END has been
           * processed.  Constructs that contain other nodes (branches, tries,
           * quantifiers) are dumped by recursing via the DUMPUNTIL macro. */
22419     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22420     const regnode *next;
22421     const regnode *optstart= NULL;
22422
22423     RXi_GET_DECL(r, ri);
22424     GET_RE_DEBUG_FLAGS_DECL;
22425
22426     PERL_ARGS_ASSERT_DUMPUNTIL;
22427
22428 #ifdef DEBUG_DUMPUNTIL
22429     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22430         last ? last-start : 0, plast ? plast-start : 0);
22431 #endif
22432
22433     if (plast && plast < last)
22434         last= plast;
22435
22436     while (PL_regkind[op] != END && (!last || node < last)) {
22437         assert(node);
22438         /* While that wasn't END last time... */
22439         NODE_ALIGN(node);
22440         op = OP(node);

              /* Closing constructs are shown one level out, matching the
               * increment done for their openers at the bottom of the loop */
22441         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22442             indent--;
22443         next = regnext((regnode *)node);
22444
22445         /* Where, what. */
              /* Only the first of a run of OPTIMIZED nodes is printed, and only
               * when optimise debugging is enabled; it is remembered in
               * 'optstart'.  The line for it is finished later by
               * CLEAR_OPTSTART, which prints the node count and the newline */
22446         if (OP(node) == OPTIMIZED) {
22447             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22448                 optstart = node;
22449             else
22450                 goto after_print;
22451         } else
22452             CLEAR_OPTSTART;
22453
22454         regprop(r, sv, node, NULL, NULL);
22455         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22456                       (int)(2*indent + 1), "", SvPVX_const(sv));
22457
22458         if (OP(node) != OPTIMIZED) {
22459             if (next == NULL)           /* Next ptr. */
22460                 Perl_re_printf( aTHX_  " (0)");
22461             else if (PL_regkind[(U8)op] == BRANCH
22462                      && PL_regkind[OP(next)] != BRANCH )
22463                 Perl_re_printf( aTHX_  " (FAIL)");
22464             else
22465                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22466             Perl_re_printf( aTHX_ "\n");
22467         }
22468
22469       after_print:
              /* Now advance 'node' past this one, first dumping anything nested
               * inside it.  DUMPUNTIL assigns to 'node' itself */
22470         if (PL_regkind[(U8)op] == BRANCHJ) {
22471             assert(next);
22472             {
22473                 const regnode *nnode = (OP(next) == LONGJMP
22474                                        ? regnext((regnode *)next)
22475                                        : next);
22476                 if (last && nnode > last)
22477                     nnode = last;
22478                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22479             }
22480         }
22481         else if (PL_regkind[(U8)op] == BRANCH) {
22482             assert(next);
22483             DUMPUNTIL(NEXTOPER(node), next);
22484         }
22485         else if ( PL_regkind[(U8)op]  == TRIE ) {
22486             const regnode *this_trie = node;

                  /* NOTE(review): this 'op' shadows the function-level U8 'op'
                   * and is a plain 'char', whose signedness is
                   * implementation-defined.  The comparisons with AHOCORASICK
                   * below presumably rely on the trie opcodes being small
                   * enough for that not to matter -- confirm */
22487             const char op = OP(node);
22488             const U32 n = ARG(node);
22489             const reg_ac_data * const ac = op>=AHOCORASICK ?
22490                (reg_ac_data *)ri->data->data[n] :
22491                NULL;
22492             const reg_trie_data * const trie =
22493                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22494 #ifdef DEBUGGING
22495             AV *const trie_words
22496                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22497 #endif
22498             const regnode *nextbranch= NULL;
22499             I32 word_idx;
22500             SvPVCLEAR(sv);

                  /* One line per word in the trie; "???" is shown for a word
                   * that can't be fetched from 'trie_words' */
22501             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22502                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22503
22504                 Perl_re_indentf( aTHX_  "%s ",
22505                     indent+3,
22506                     elem_ptr
22507                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22508                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22509                                 PL_colors[0], PL_colors[1],
22510                                 (SvUTF8(*elem_ptr)
22511                                  ? PERL_PV_ESCAPE_UNI
22512                                  : 0)
22513                                 | PERL_PV_PRETTY_ELLIPSES
22514                                 | PERL_PV_PRETTY_LTGT
22515                             )
22516                     : "???"
22517                 );

                      /* With a jump table, show where this word continues: its
                       * own jump target when that is non-zero, otherwise 'next'.
                       * A non-zero target is also dumped, up to 'nextbranch' */
22518                 if (trie->jump) {
22519                     U16 dist= trie->jump[word_idx+1];
22520                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22521                                (UV)((dist ? this_trie + dist : next) - start));
22522                     if (dist) {
22523                         if (!nextbranch)
22524                             nextbranch= this_trie + trie->jump[0];
22525                         DUMPUNTIL(this_trie + dist, nextbranch);
22526                     }
22527                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22528                         nextbranch= regnext((regnode *)nextbranch);
22529                 } else {
22530                     Perl_re_printf( aTHX_  "\n");
22531                 }
22532             }

                  /* Resume after the trie, but never beyond 'last' */
22533             if (last && next > last)
22534                 node= last;
22535             else
22536                 node= next;
22537         }
22538         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22539             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22540                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22541         }
22542         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22543             assert(next);
22544             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22545         }
22546         else if ( op == PLUS || op == STAR) {
22547             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22548         }
22549         else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22550             /* Literal string, where present. */
22551             node += NODE_SZ_STR(node) - 1;
22552             node = NEXTOPER(node);
22553         }
22554         else {
                  /* Any other node: step over it and its arguments */
22555             node = NEXTOPER(node);
22556             node += regarglen[(U8)op];
22557         }

              /* What follows an opening construct is shown one level in */
22558         if (op == CURLYX || op == OPEN || op == SROPEN)
22559             indent++;
22560     }

          /* Finish off any run of OPTIMIZED nodes still pending */
22561     CLEAR_OPTSTART;
22562 #ifdef DEBUG_DUMPUNTIL
22563     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22564 #endif
22565     return node;
22566 }
22567
22568 #endif  /* DEBUGGING */
22569
22570 #ifndef PERL_IN_XSUB_RE
22571
22572 #include "uni_keywords.h"
22573
22574 void
22575 Perl_init_uniprops(pTHX)
22576 {
          /* Initialize the interpreter-level variables used for Unicode property
           * handling: the hash that holds user-defined properties, and the
           * inversion lists that are built from the compiled-in tables.  On
           * DEBUGGING builds it also sets PL_dump_re_max_len from the environment
           * variable PERL_DUMP_RE_MAX_LEN, falling back to a default when that is
           * unset or isn't parseable by grok_atoUV(). */
22577     dVAR;
22578
22579 #ifdef DEBUGGING
22580     char * dump_len_string;
          UV dump_len = 0;    /* grok_atoUV() stores a full UV, so parse into a
                                 real one rather than through a cast pointer to
                                 PL_dump_re_max_len, which need not be the same
                                 size */
22581
22582     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22583     if (   dump_len_string
22584         && grok_atoUV(dump_len_string, &dump_len, NULL))
22585     {
              /* Presumably a STRLEN: it is what gets passed as the length limit
               * to pv_pretty() when dumping */
              PL_dump_re_max_len = (STRLEN) dump_len;
          }
          else {
22586         PL_dump_re_max_len = 60;    /* A reasonable default */
22587     }
22588 #endif
22589
22590     PL_user_def_props = newHV();
22591
22592 #ifdef USE_ITHREADS
22593
22594     HvSHAREKEYS_off(PL_user_def_props);
22595     PL_user_def_props_aTHX = aTHX;
22596
22597 #endif
22598
22599     /* Set up the inversion list interpreter-level variables */
22600
22601     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22602     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22603     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22604     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22605     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22606     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22607     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22608     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22609     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22610     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22611     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22612     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22613     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22614     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22615     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22616     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22617
22618     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22619     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22620     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22621     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);

          /* This entry shares the _CC_ALPHA list set just above rather than
           * getting one of its own, so it has to come after that assignment */
22622     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22623     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22624     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22625     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22626     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22627     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22628     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22629     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22630     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22631     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22632     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22633     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22634
22635     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22636     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22637     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22638     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22639     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22640
22641     PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
22642     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22643     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22644     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22645
22646     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22647
22648     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22649     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22650
22651     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22652     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22653
22654     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22655     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22656                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22657     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22658                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22659     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22660     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22661     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22662     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22663     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22664     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22665     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22666     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22667     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22668
22669 #ifdef UNI_XIDC
22670     /* The below are used only by deprecated functions.  They could be removed */
22671     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22672     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22673     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22674 #endif
22675 }
22676
22677 #if 0
22678
This code was mainly added for backcompat to give a warning for non-portable
code points in user-defined properties.  But experiments showed that the
warning in earlier perls was only emitted on overflow, which should be an
error, so there really is not a backcompat issue, and actually adding the
warning when none was present before might cause breakage, for little gain.  So
khw left this code in, but not enabled.  Tests were never added.
22685
22686 embed.fnc entry:
22687 Ei      |const char *|get_extended_utf8_msg|const UV cp
22688
/* Returns the string stored under the "text" key of the messages hash that
 * uvchr_to_utf8_flags_msgs() fills in when asked to encode 'cp' with the
 * UNICODE_WARN_PERL_EXTENDED flag.
 *
 * This definition sits inside an #if 0 region, so it is not compiled.
 *
 * NOTE(review): the returned pointer refers into an SV fetched from a hash
 * that has been mortalized here; presumably it stays valid only as long as
 * that hash does -- confirm before enabling this code. */
PERL_STATIC_INLINE const char *
S_get_extended_utf8_msg(pTHX_ const UV cp)
{
    U8 dummy[UTF8_MAXBYTES + 1];    /* Receives the encoded bytes; never read
                                       afterwards */
    HV *msgs;                       /* Set by the call below */
    SV **msg;                       /* The "text" entry of 'msgs' */

    uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
                             &msgs);

    /* The code asserts that a "text" entry is always present */
    msg = hv_fetchs(msgs, "text", 0);
    assert(msg);

    /* Mortalize the hash rather than freeing it explicitly here */
    (void) sv_2mortal((SV *) msgs);

    return SvPVX(*msg);
}
22706
22707 #endif
22708
22709 SV *
22710 Perl_handle_user_defined_property(pTHX_
22711
22712     /* Parses the contents of a user-defined property definition; returning the
22713      * expanded definition if possible.  If so, the return is an inversion
22714      * list.
22715      *
22716      * If there are subroutines that are part of the expansion and which aren't
22717      * known at the time of the call to this function, this returns what
22718      * parse_uniprop_string() returned for the first one encountered.
22719      *
22720      * If an error was found, NULL is returned, and 'msg' gets a suitable
22721      * message appended to it.  (Appending allows the back trace of how we got
22722      * to the faulty definition to be displayed through nested calls of
22723      * user-defined subs.)
22724      *
22725      * The caller IS responsible for freeing any returned SV.
22726      *
22727      * The syntax of the contents is pretty much described in perlunicode.pod,
22728      * but we also allow comments on each line */
22729
22730     const char * name,          /* Name of property */
22731     const STRLEN name_len,      /* The name's length in bytes */
22732     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22733     const bool to_fold,         /* ? Is this under /i */
22734     const bool runtime,         /* ? Are we in compile- or run-time */
22735     const bool deferrable,      /* Is it ok for this property's full definition
22736                                    to be deferred until later? */
22737     SV* contents,               /* The property's definition */
22738     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
22739                                    getting called unless this is thought to be
22740                                    a user-defined property */
22741     SV * msg,                   /* Any error or warning msg(s) are appended to
22742                                    this */
22743     const STRLEN level)         /* Recursion level of this call */
22744 {
22745     STRLEN len;
22746     const char * string         = SvPV_const(contents, len);
22747     const char * const e        = string + len;
22748     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22749     const STRLEN msgs_length_on_entry = SvCUR(msg);
22750
22751     const char * s0 = string;   /* Points to first byte in the current line
22752                                    being parsed in 'string' */
22753     const char overflow_msg[] = "Code point too large in \"";
22754     SV* running_definition = NULL;
22755
22756     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22757
22758     *user_defined_ptr = TRUE;
22759
22760     /* Look at each line */
22761     while (s0 < e) {
22762         const char * s;     /* Current byte */
22763         char op = '+';      /* Default operation is 'union' */
22764         IV   min = 0;       /* range begin code point */
22765         IV   max = -1;      /* and range end */
22766         SV* this_definition;
22767
22768         /* Skip comment lines */
22769         if (*s0 == '#') {
22770             s0 = strchr(s0, '\n');
22771             if (s0 == NULL) {
22772                 break;
22773             }
22774             s0++;
22775             continue;
22776         }
22777
22778         /* For backcompat, allow an empty first line */
22779         if (*s0 == '\n') {
22780             s0++;
22781             continue;
22782         }
22783
22784         /* First character in the line may optionally be the operation */
22785         if (   *s0 == '+'
22786             || *s0 == '!'
22787             || *s0 == '-'
22788             || *s0 == '&')
22789         {
22790             op = *s0++;
22791         }
22792
22793         /* If the line is one or two hex digits separated by blank space, its
22794          * a range; otherwise it is either another user-defined property or an
22795          * error */
22796
22797         s = s0;
22798
22799         if (! isXDIGIT(*s)) {
22800             goto check_if_property;
22801         }
22802
22803         do { /* Each new hex digit will add 4 bits. */
22804             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22805                 s = strchr(s, '\n');
22806                 if (s == NULL) {
22807                     s = e;
22808                 }
22809                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22810                 sv_catpv(msg, overflow_msg);
22811                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22812                                      UTF8fARG(is_contents_utf8, s - s0, s0));
22813                 sv_catpvs(msg, "\"");
22814                 goto return_failure;
22815             }
22816
22817             /* Accumulate this digit into the value */
22818             min = (min << 4) + READ_XDIGIT(s);
22819         } while (isXDIGIT(*s));
22820
22821         while (isBLANK(*s)) { s++; }
22822
22823         /* We allow comments at the end of the line */
22824         if (*s == '#') {
22825             s = strchr(s, '\n');
22826             if (s == NULL) {
22827                 s = e;
22828             }
22829             s++;
22830         }
22831         else if (s < e && *s != '\n') {
22832             if (! isXDIGIT(*s)) {
22833                 goto check_if_property;
22834             }
22835
22836             /* Look for the high point of the range */
22837             max = 0;
22838             do {
22839                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22840                     s = strchr(s, '\n');
22841                     if (s == NULL) {
22842                         s = e;
22843                     }
22844                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22845                     sv_catpv(msg, overflow_msg);
22846                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22847                                       UTF8fARG(is_contents_utf8, s - s0, s0));
22848                     sv_catpvs(msg, "\"");
22849                     goto return_failure;
22850                 }
22851
22852                 max = (max << 4) + READ_XDIGIT(s);
22853             } while (isXDIGIT(*s));
22854
22855             while (isBLANK(*s)) { s++; }
22856
22857             if (*s == '#') {
22858                 s = strchr(s, '\n');
22859                 if (s == NULL) {
22860                     s = e;
22861                 }
22862             }
22863             else if (s < e && *s != '\n') {
22864                 goto check_if_property;
22865             }
22866         }
22867
22868         if (max == -1) {    /* The line only had one entry */
22869             max = min;
22870         }
22871         else if (max < min) {
22872             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22873             sv_catpvs(msg, "Illegal range in \"");
22874             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22875                                 UTF8fARG(is_contents_utf8, s - s0, s0));
22876             sv_catpvs(msg, "\"");
22877             goto return_failure;
22878         }
22879
22880 #if 0   /* See explanation at definition above of get_extended_utf8_msg() */
22881
22882         if (   UNICODE_IS_PERL_EXTENDED(min)
22883             || UNICODE_IS_PERL_EXTENDED(max))
22884         {
22885             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22886
22887             /* If both code points are non-portable, warn only on the lower
22888              * one. */
22889             sv_catpv(msg, get_extended_utf8_msg(
22890                                             (UNICODE_IS_PERL_EXTENDED(min))
22891                                             ? min : max));
22892             sv_catpvs(msg, " in \"");
22893             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22894                                  UTF8fARG(is_contents_utf8, s - s0, s0));
22895             sv_catpvs(msg, "\"");
22896         }
22897
22898 #endif
22899
22900         /* Here, this line contains a legal range */
22901         this_definition = sv_2mortal(_new_invlist(2));
22902         this_definition = _add_range_to_invlist(this_definition, min, max);
22903         goto calculate;
22904
22905       check_if_property:
22906
22907         /* Here it isn't a legal range line.  See if it is a legal property
22908          * line.  First find the end of the meat of the line */
22909         s = strpbrk(s, "#\n");
22910         if (s == NULL) {
22911             s = e;
22912         }
22913
22914         /* Ignore trailing blanks in keeping with the requirements of
22915          * parse_uniprop_string() */
22916         s--;
22917         while (s > s0 && isBLANK_A(*s)) {
22918             s--;
22919         }
22920         s++;
22921
22922         this_definition = parse_uniprop_string(s0, s - s0,
22923                                                is_utf8, to_fold, runtime,
22924                                                deferrable,
22925                                                user_defined_ptr, msg,
22926                                                (name_len == 0)
22927                                                 ? level /* Don't increase level
22928                                                            if input is empty */
22929                                                 : level + 1
22930                                               );
22931         if (this_definition == NULL) {
22932             goto return_failure;    /* 'msg' should have had the reason
22933                                        appended to it by the above call */
22934         }
22935
22936         if (! is_invlist(this_definition)) {    /* Unknown at this time */
22937             return newSVsv(this_definition);
22938         }
22939
22940         if (*s != '\n') {
22941             s = strchr(s, '\n');
22942             if (s == NULL) {
22943                 s = e;
22944             }
22945         }
22946
22947       calculate:
22948
22949         switch (op) {
22950             case '+':
22951                 _invlist_union(running_definition, this_definition,
22952                                                         &running_definition);
22953                 break;
22954             case '-':
22955                 _invlist_subtract(running_definition, this_definition,
22956                                                         &running_definition);
22957                 break;
22958             case '&':
22959                 _invlist_intersection(running_definition, this_definition,
22960                                                         &running_definition);
22961                 break;
22962             case '!':
22963                 _invlist_union_complement_2nd(running_definition,
22964                                         this_definition, &running_definition);
22965                 break;
22966             default:
22967                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22968                                  __FILE__, __LINE__, op);
22969                 break;
22970         }
22971
22972         /* Position past the '\n' */
22973         s0 = s + 1;
22974     }   /* End of loop through the lines of 'contents' */
22975
22976     /* Here, we processed all the lines in 'contents' without error.  If we
22977      * didn't add any warnings, simply return success */
22978     if (msgs_length_on_entry == SvCUR(msg)) {
22979
22980         /* If the expansion was empty, the answer isn't nothing: its an empty
22981          * inversion list */
22982         if (running_definition == NULL) {
22983             running_definition = _new_invlist(1);
22984         }
22985
22986         return running_definition;
22987     }
22988
22989     /* Otherwise, add some explanatory text, but we will return success */
22990     goto return_msg;
22991
22992   return_failure:
22993     running_definition = NULL;
22994
22995   return_msg:
22996
22997     if (name_len > 0) {
22998         sv_catpvs(msg, " in expansion of ");
22999         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23000     }
23001
23002     return running_definition;
23003 }
23004
/* As explained below, certain operations need to take place in the first
 * thread created.  These macros switch contexts.
 *
 * Intended usage within a function, in this order:
 *      DECLARATION_FOR_GLOBAL_CONTEXT;     among the declarations
 *      SWITCH_TO_GLOBAL_CONTEXT;
 *      ... work ...
 *      RESTORE_CONTEXT;
 *
 * Without USE_ITHREADS all of them expand to nothing (or NULL). */
#ifdef USE_ITHREADS
/* Remembers the caller's interpreter in 'save_aTHX'.  Note the expansion
 * already ends with a ';' */
#  define DECLARATION_FOR_GLOBAL_CONTEXT                                    \
                                        PerlInterpreter * save_aTHX = aTHX;
/* Makes PL_user_def_props_aTHX the current context, also assigning it to
 * aTHX */
#  define SWITCH_TO_GLOBAL_CONTEXT                                          \
                           PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
/* Switches back to the interpreter saved in 'save_aTHX'.  Note the expansion
 * already ends with a ';' */
#  define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
/* Whatever aTHX currently is */
#  define CUR_CONTEXT      aTHX
/* The interpreter saved by DECLARATION_FOR_GLOBAL_CONTEXT */
#  define ORIGINAL_CONTEXT save_aTHX
#else
#  define DECLARATION_FOR_GLOBAL_CONTEXT
#  define SWITCH_TO_GLOBAL_CONTEXT          NOOP
#  define RESTORE_CONTEXT                   NOOP
#  define CUR_CONTEXT                       NULL
#  define ORIGINAL_CONTEXT                  NULL
#endif
23022
23023 STATIC void
23024 S_delete_recursion_entry(pTHX_ void *key)
23025 {
23026     /* Deletes the entry used to detect recursion when expanding user-defined
23027      * properties.  This is a function so it can be set up to be called even if
23028      * the program unexpectedly quits */
23029
23030     dVAR;
23031     SV ** current_entry;
23032     const STRLEN key_len = strlen((const char *) key);
23033     DECLARATION_FOR_GLOBAL_CONTEXT;
23034
23035     SWITCH_TO_GLOBAL_CONTEXT;
23036
23037     /* If the entry is one of these types, it is a permanent entry, and not the
23038      * one used to detect recursions.  This function should delete only the
23039      * recursion entry */
23040     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23041     if (     current_entry
23042         && ! is_invlist(*current_entry)
23043         && ! SvPOK(*current_entry))
23044     {
23045         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23046                                                                     G_DISCARD);
23047     }
23048
23049     RESTORE_CONTEXT;
23050 }
23051
23052 STATIC SV *
23053 S_get_fq_name(pTHX_
23054               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23055               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23056               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23057               const bool has_colon_colon
23058              )
23059 {
23060     /* Returns a mortal SV containing the fully qualified version of the input
23061      * name */
23062
23063     SV * fq_name;
23064
23065     fq_name = newSVpvs_flags("", SVs_TEMP);
23066
23067     /* Use the current package if it wasn't included in our input */
23068     if (! has_colon_colon) {
23069         const HV * pkg = (IN_PERL_COMPILETIME)
23070                          ? PL_curstash
23071                          : CopSTASH(PL_curcop);
23072         const char* pkgname = HvNAME(pkg);
23073
23074         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23075                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23076         sv_catpvs(fq_name, "::");
23077     }
23078
23079     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23080                          UTF8fARG(is_utf8, name_len, name));
23081     return fq_name;
23082 }
23083
23084 SV *
23085 Perl_parse_uniprop_string(pTHX_
23086
23087     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23088      * now.  If so, the return is an inversion list.
23089      *
23090      * If the property is user-defined, it is a subroutine, which in turn
23091      * may call other subroutines.  This function will call the whole nest of
23092      * them to get the definition they return; if some aren't known at the time
23093      * of the call to this function, the fully qualified name of the highest
23094      * level sub is returned.  It is an error to call this function at runtime
23095      * without every sub defined.
23096      *
23097      * If an error was found, NULL is returned, and 'msg' gets a suitable
23098      * message appended to it.  (Appending allows the back trace of how we got
23099      * to the faulty definition to be displayed through nested calls of
23100      * user-defined subs.)
23101      *
23102      * The caller should NOT try to free any returned inversion list.
23103      *
23104      * Other parameters will be set on return as described below */
23105
23106     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23107     const Size_t name_len,      /* Its length in bytes, not including any
23108                                    trailing space */
23109     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23110     const bool to_fold,         /* ? Is this under /i */
23111     const bool runtime,         /* TRUE if this is being called at run time */
23112     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23113                                    known at this call */
23114     bool *user_defined_ptr,     /* Upon return from this function it will be
23115                                    set to TRUE if any component is a
23116                                    user-defined property */
23117     SV * msg,                   /* Any error or warning msg(s) are appended to
23118                                    this */
23119    const STRLEN level)          /* Recursion level of this call */
23120 {
23121     dVAR;
23122     char* lookup_name;          /* normalized name for lookup in our tables */
23123     unsigned lookup_len;        /* Its length */
23124     bool stricter = FALSE;      /* Some properties have stricter name
23125                                    normalization rules, which we decide upon
23126                                    based on parsing */
23127
23128     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23129      * (though it requires extra effort to download them from Unicode and
23130      * compile perl to know about them) */
23131     bool is_nv_type = FALSE;
23132
23133     unsigned int i, j = 0;
23134     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23135     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23136     int table_index = 0;    /* The entry number for this property in the table
23137                                of all Unicode property names */
23138     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23139     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23140                                    the normalized name in certain situations */
23141     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23142                                    part of a package name */
23143     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23144                                              property rather than a Unicode
23145                                              one. */
23146     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23147                                      if an error.  If it is an inversion list,
23148                                      it is the definition.  Otherwise it is a
23149                                      string containing the fully qualified sub
23150                                      name of 'name' */
23151     SV * fq_name = NULL;        /* For user-defined properties, the fully
23152                                    qualified name */
23153     bool invert_return = FALSE; /* ? Do we need to complement the result before
23154                                      returning it */
23155
23156     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23157
23158     /* The input will be normalized into 'lookup_name' */
23159     Newx(lookup_name, name_len, char);
23160     SAVEFREEPV(lookup_name);
23161
23162     /* Parse the input. */
23163     for (i = 0; i < name_len; i++) {
23164         char cur = name[i];
23165
23166         /* Most of the characters in the input will be of this ilk, being parts
23167          * of a name */
23168         if (isIDCONT_A(cur)) {
23169
23170             /* Case differences are ignored.  Our lookup routine assumes
23171              * everything is lowercase, so normalize to that */
23172             if (isUPPER_A(cur)) {
23173                 lookup_name[j++] = toLOWER_A(cur);
23174                 continue;
23175             }
23176
23177             if (cur == '_') { /* Don't include these in the normalized name */
23178                 continue;
23179             }
23180
23181             lookup_name[j++] = cur;
23182
23183             /* The first character in a user-defined name must be of this type.
23184              * */
23185             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23186                 could_be_user_defined = FALSE;
23187             }
23188
23189             continue;
23190         }
23191
23192         /* Here, the character is not something typically in a name,  But these
23193          * two types of characters (and the '_' above) can be freely ignored in
23194          * most situations.  Later it may turn out we shouldn't have ignored
23195          * them, and we have to reparse, but we don't have enough information
23196          * yet to make that decision */
23197         if (cur == '-' || isSPACE_A(cur)) {
23198             could_be_user_defined = FALSE;
23199             continue;
23200         }
23201
23202         /* An equals sign or single colon mark the end of the first part of
23203          * the property name */
23204         if (    cur == '='
23205             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23206         {
23207             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23208             equals_pos = j; /* Note where it occurred in the input */
23209             could_be_user_defined = FALSE;
23210             break;
23211         }
23212
23213         /* Otherwise, this character is part of the name. */
23214         lookup_name[j++] = cur;
23215
23216         /* Here it isn't a single colon, so if it is a colon, it must be a
23217          * double colon */
23218         if (cur == ':') {
23219
23220             /* A double colon should be a package qualifier.  We note its
23221              * position and continue.  Note that one could have
23222              *      pkg1::pkg2::...::foo
23223              * so that the position at the end of the loop will be just after
23224              * the final qualifier */
23225
23226             i++;
23227             non_pkg_begin = i + 1;
23228             lookup_name[j++] = ':';
23229         }
23230         else { /* Only word chars (and '::') can be in a user-defined name */
23231             could_be_user_defined = FALSE;
23232         }
23233     } /* End of parsing through the lhs of the property name (or all of it if
23234          no rhs) */
23235
23236 #define STRLENs(s)  (sizeof("" s "") - 1)
23237
23238     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23239      * be for a user-defined property, or it could be a Unicode property, as
23240      * all of them are considered to be for that package.  For the purposes of
23241      * parsing the rest of the property, strip it off */
23242     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23243         lookup_name +=  STRLENs("utf8::");
23244         j -=  STRLENs("utf8::");
23245         equals_pos -=  STRLENs("utf8::");
23246     }
23247
23248     /* Here, we are either done with the whole property name, if it was simple;
23249      * or are positioned just after the '=' if it is compound. */
23250
23251     if (equals_pos >= 0) {
23252         assert(! stricter); /* We shouldn't have set this yet */
23253
23254         /* Space immediately after the '=' is ignored */
23255         i++;
23256         for (; i < name_len; i++) {
23257             if (! isSPACE_A(name[i])) {
23258                 break;
23259             }
23260         }
23261
23262         /* Most punctuation after the equals indicates a subpattern, like
23263          * \p{foo=/bar/} */
23264         if (   isPUNCT_A(name[i])
23265             && name[i] != '-'
23266             && name[i] != '+'
23267             && name[i] != '_'
23268             && name[i] != '{')
23269         {
23270             /* Find the property.  The table includes the equals sign, so we
23271              * use 'j' as-is */
23272             table_index = match_uniprop((U8 *) lookup_name, j);
23273             if (table_index) {
23274                 const char * const * prop_values
23275                                             = UNI_prop_value_ptrs[table_index];
23276                 SV * subpattern;
23277                 Size_t subpattern_len;
23278                 REGEXP * subpattern_re;
23279                 char open = name[i++];
23280                 char close;
23281                 const char * pos_in_brackets;
23282                 bool escaped = 0;
23283
23284                 /* A backslash means the real delimitter is the next character.
23285                  * */
23286                 if (open == '\\') {
23287                     open = name[i++];
23288                     escaped = 1;
23289                 }
23290
23291                 /* This data structure is constructed so that the matching
23292                  * closing bracket is 3 past its matching opening.  The second
23293                  * set of closing is so that if the opening is something like
23294                  * ']', the closing will be that as well.  Something similar is
23295                  * done in toke.c */
23296                 pos_in_brackets = strchr("([<)]>)]>", open);
23297                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23298
23299                 if (    i >= name_len
23300                     ||  name[name_len-1] != close
23301                     || (escaped && name[name_len-2] != '\\'))
23302                 {
23303                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23304                     goto append_name_to_msg;
23305                 }
23306
23307                 Perl_ck_warner_d(aTHX_
23308                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23309                     "The Unicode property wildcards feature is experimental");
23310
23311                 /* Now create and compile the wildcard subpattern.  Use /iaa
23312                  * because nothing outside of ASCII will match, and it the
23313                  * property values should all match /i.  Note that when the
23314                  * pattern fails to compile, our added text to the user's
23315                  * pattern will be displayed to the user, which is not so
23316                  * desirable. */
23317                 subpattern_len = name_len - i - 1 - escaped;
23318                 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
23319                                               (unsigned) subpattern_len,
23320                                               name + i);
23321                 subpattern = sv_2mortal(subpattern);
23322                 subpattern_re = re_compile(subpattern, 0);
23323                 assert(subpattern_re);  /* Should have died if didn't compile
23324                                          successfully */
23325
23326                 /* For each legal property value, see if the supplied pattern
23327                  * matches it. */
23328                 while (*prop_values) {
23329                     const char * const entry = *prop_values;
23330                     const Size_t len = strlen(entry);
23331                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23332
23333                     if (pregexec(subpattern_re,
23334                                  (char *) entry,
23335                                  (char *) entry + len,
23336                                  (char *) entry, 0,
23337                                  entry_sv,
23338                                  0))
23339                     { /* Here, matched.  Add to the returned list */
23340                         Size_t total_len = j + len;
23341                         SV * sub_invlist = NULL;
23342                         char * this_string;
23343
23344                         /* We know this is a legal \p{property=value}.  Call
23345                          * the function to return the list of code points that
23346                          * match it */
23347                         Newxz(this_string, total_len + 1, char);
23348                         Copy(lookup_name, this_string, j, char);
23349                         my_strlcat(this_string, entry, total_len + 1);
23350                         SAVEFREEPV(this_string);
23351                         sub_invlist = parse_uniprop_string(this_string,
23352                                                            total_len,
23353                                                            is_utf8,
23354                                                            to_fold,
23355                                                            runtime,
23356                                                            deferrable,
23357                                                            user_defined_ptr,
23358                                                            msg,
23359                                                            level + 1);
23360                         _invlist_union(prop_definition, sub_invlist,
23361                                        &prop_definition);
23362                     }
23363
23364                     prop_values++;  /* Next iteration, look at next propvalue */
23365                 } /* End of looking through property values; (the data
23366                      structure is terminated by a NULL ptr) */
23367
23368                 SvREFCNT_dec_NN(subpattern_re);
23369
23370                 if (prop_definition) {
23371                     return prop_definition;
23372                 }
23373
23374                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23375                 goto append_name_to_msg;
23376             }
23377
23378             /* Here's how khw thinks we should proceed to handle the properties
23379              * not yet done:    Bidi Mirroring Glyph
23380                                 Bidi Paired Bracket
23381                                 Case Folding  (both full and simple)
23382                                 Decomposition Mapping
23383                                 Equivalent Unified Ideograph
23384                                 Name
23385                                 Name Alias
23386                                 Lowercase Mapping  (both full and simple)
23387                                 NFKC Case Fold
23388                                 Titlecase Mapping  (both full and simple)
23389                                 Uppercase Mapping  (both full and simple)
23390              * Move the part that looks at the property values into a perl
23391              * script, like utf8_heavy.pl was done.  This makes things somewhat
23392              * easier, but most importantly, it avoids always adding all these
23393              * strings to the memory usage when the feature is little-used.
23394              *
23395              * The property values would all be concatenated into a single
23396              * string per property with each value on a separate line, and the
23397              * code point it's for on alternating lines.  Then we match the
23398              * user's input pattern m//mg, without having to worry about their
23399              * uses of '^' and '$'.  Only the values that aren't the default
23400              * would be in the strings.  Code points would be in UTF-8.  The
23401              * search pattern that we would construct would look like
23402              * (?: \n (code-point_re) \n (?aam: user-re ) \n )
23403              * And so $1 would contain the code point that matched the user-re.
23404              * For properties where the default is the code point itself, such
23405              * as any of the case changing mappings, the string would otherwise
23406              * consist of all Unicode code points in UTF-8 strung together.
23407              * This would be impractical.  So instead, examine their compiled
23408              * pattern, looking at the ssc.  If none, reject the pattern as an
23409              * error.  Otherwise run the pattern against every code point in
23410              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets.
23411              * And it might be good to create an API to return the ssc.
23412              *
23413              * For the name properties, a new function could be created in
23414              * charnames which essentially does the same thing as above,
23415              * sharing Name.pl with the other charname functions.  Don't know
23416              * about loose name matching, or algorithmically determined names.
23417              * Decomposition.pl similarly.
23418              *
23419              * It might be that a new pattern modifier would have to be
23420              * created, like /t for resTricTed, which changed the behavior of
23421              * some constructs in their subpattern, like \A. */
23422         } /* End of is a wildcard subppattern */
23423
23424
23425         /* Certain properties whose values are numeric need special handling.
23426          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
23427          * purposes of checking if this is one of those properties */
23428         if (memBEGINPs(lookup_name, j, "is")) {
23429             lookup_offset = 2;
23430         }
23431
23432         /* Then check if it is one of these specially-handled properties.  The
23433          * possibilities are hard-coded because easier this way, and the list
23434          * is unlikely to change.
23435          *
23436          * All numeric value type properties are of this ilk, and are also
23437          * special in a different way later on.  So find those first.  There
23438          * are several numeric value type properties in the Unihan DB (which is
23439          * unlikely to be compiled with perl, but we handle it here in case it
23440          * does get compiled).  They all end with 'numeric'.  The interiors
23441          * aren't checked for the precise property.  This would stop working if
23442          * a cjk property were to be created that ended with 'numeric' and
23443          * wasn't a numeric type */
23444         is_nv_type = memEQs(lookup_name + lookup_offset,
23445                        j - 1 - lookup_offset, "numericvalue")
23446                   || memEQs(lookup_name + lookup_offset,
23447                       j - 1 - lookup_offset, "nv")
23448                   || (   memENDPs(lookup_name + lookup_offset,
23449                             j - 1 - lookup_offset, "numeric")
23450                       && (   memBEGINPs(lookup_name + lookup_offset,
23451                                       j - 1 - lookup_offset, "cjk")
23452                           || memBEGINPs(lookup_name + lookup_offset,
23453                                       j - 1 - lookup_offset, "k")));
23454         if (   is_nv_type
23455             || memEQs(lookup_name + lookup_offset,
23456                       j - 1 - lookup_offset, "canonicalcombiningclass")
23457             || memEQs(lookup_name + lookup_offset,
23458                       j - 1 - lookup_offset, "ccc")
23459             || memEQs(lookup_name + lookup_offset,
23460                       j - 1 - lookup_offset, "age")
23461             || memEQs(lookup_name + lookup_offset,
23462                       j - 1 - lookup_offset, "in")
23463             || memEQs(lookup_name + lookup_offset,
23464                       j - 1 - lookup_offset, "presentin"))
23465         {
23466             unsigned int k;
23467
23468             /* Since the stuff after the '=' is a number, we can't throw away
23469              * '-' willy-nilly, as those could be a minus sign.  Other stricter
23470              * rules also apply.  However, these properties all can have the
23471              * rhs not be a number, in which case they contain at least one
23472              * alphabetic.  In those cases, the stricter rules don't apply.
23473              * But the numeric type properties can have the alphas [Ee] to
23474              * signify an exponent, and it is still a number with stricter
23475              * rules.  So look for an alpha that signifies not-strict */
23476             stricter = TRUE;
23477             for (k = i; k < name_len; k++) {
23478                 if (   isALPHA_A(name[k])
23479                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
23480                 {
23481                     stricter = FALSE;
23482                     break;
23483                 }
23484             }
23485         }
23486
23487         if (stricter) {
23488
23489             /* A number may have a leading '+' or '-'.  The latter is retained
23490              * */
23491             if (name[i] == '+') {
23492                 i++;
23493             }
23494             else if (name[i] == '-') {
23495                 lookup_name[j++] = '-';
23496                 i++;
23497             }
23498
23499             /* Skip leading zeros including single underscores separating the
23500              * zeros, or between the final leading zero and the first other
23501              * digit */
23502             for (; i < name_len - 1; i++) {
23503                 if (    name[i] != '0'
23504                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23505                 {
23506                     break;
23507                 }
23508             }
23509         }
23510     }
23511     else {  /* No '=' */
23512
23513        /* Only a few properties without an '=' should be parsed with stricter
23514         * rules.  The list is unlikely to change. */
23515         if (   memBEGINPs(lookup_name, j, "perl")
23516             && memNEs(lookup_name + 4, j - 4, "space")
23517             && memNEs(lookup_name + 4, j - 4, "word"))
23518         {
23519             stricter = TRUE;
23520
23521             /* We set the inputs back to 0 and the code below will reparse,
23522              * using strict */
23523             i = j = 0;
23524         }
23525     }
23526
23527     /* Here, we have either finished the property, or are positioned to parse
23528      * the remainder, and we know if stricter rules apply.  Finish out, if not
23529      * already done */
23530     for (; i < name_len; i++) {
23531         char cur = name[i];
23532
23533         /* In all instances, case differences are ignored, and we normalize to
23534          * lowercase */
23535         if (isUPPER_A(cur)) {
23536             lookup_name[j++] = toLOWER(cur);
23537             continue;
23538         }
23539
23540         /* An underscore is skipped, but not under strict rules unless it
23541          * separates two digits */
23542         if (cur == '_') {
23543             if (    stricter
23544                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
23545                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23546             {
23547                 lookup_name[j++] = '_';
23548             }
23549             continue;
23550         }
23551
23552         /* Hyphens are skipped except under strict */
23553         if (cur == '-' && ! stricter) {
23554             continue;
23555         }
23556
23557         /* XXX Bug in documentation.  It says white space skipped adjacent to
23558          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
23559          * in a number */
23560         if (isSPACE_A(cur) && ! stricter) {
23561             continue;
23562         }
23563
23564         lookup_name[j++] = cur;
23565
23566         /* Unless this is a non-trailing slash, we are done with it */
23567         if (i >= name_len - 1 || cur != '/') {
23568             continue;
23569         }
23570
23571         slash_pos = j;
23572
23573         /* A slash in the 'numeric value' property indicates that what follows
23574          * is a denominator.  It can have a leading '+' and '0's that should be
23575          * skipped.  But we have never allowed a negative denominator, so treat
23576          * a minus like every other character.  (No need to rule out a second
23577          * '/', as that won't match anything anyway */
23578         if (is_nv_type) {
23579             i++;
23580             if (i < name_len && name[i] == '+') {
23581                 i++;
23582             }
23583
23584             /* Skip leading zeros including underscores separating digits */
23585             for (; i < name_len - 1; i++) {
23586                 if (   name[i] != '0'
23587                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23588                 {
23589                     break;
23590                 }
23591             }
23592
23593             /* Store the first real character in the denominator */
23594             if (i < name_len) {
23595                 lookup_name[j++] = name[i];
23596             }
23597         }
23598     }
23599
23600     /* Here, we are completely done parsing the input 'name', and 'lookup_name'
23601      * contains a copy, normalized.
23602      *
23603      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23604      * different from without the underscores.  */
23605     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
23606            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23607         && UNLIKELY(name[name_len-1] == '_'))
23608     {
23609         lookup_name[j++] = '&';
23610     }
23611
23612     /* If the original input began with 'In' or 'Is', it could be a subroutine
23613      * call to a user-defined property instead of a Unicode property name. */
23614     if (    name_len - non_pkg_begin > 2
23615         &&  name[non_pkg_begin+0] == 'I'
23616         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23617     {
23618         /* Names that start with In have different characteristics than those
23619          * that start with Is */
23620         if (name[non_pkg_begin+1] == 's') {
23621             starts_with_Is = TRUE;
23622         }
23623     }
23624     else {
23625         could_be_user_defined = FALSE;
23626     }
23627
23628     if (could_be_user_defined) {
23629         CV* user_sub;
23630
23631         /* If the user defined property returns the empty string, it could
23632          * easily be because the pattern is being compiled before the data it
23633          * actually needs to compile is available.  This could be argued to be
23634          * a bug in the perl code, but this is a change of behavior for Perl,
23635          * so we handle it.  This means that intentionally returning nothing
23636          * will not be resolved until runtime */
23637         bool empty_return = FALSE;
23638
23639         /* Here, the name could be for a user defined property, which are
23640          * implemented as subs. */
23641         user_sub = get_cvn_flags(name, name_len, 0);
23642         if (user_sub) {
23643             const char insecure[] = "Insecure user-defined property";
23644
23645             /* Here, there is a sub by the correct name.  Normally we call it
23646              * to get the property definition */
23647             dSP;
23648             SV * user_sub_sv = MUTABLE_SV(user_sub);
23649             SV * error;     /* Any error returned by calling 'user_sub' */
23650             SV * key;       /* The key into the hash of user defined sub names
23651                              */
23652             SV * placeholder;
23653             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
23654
23655             /* How many times to retry when another thread is in the middle of
23656              * expanding the same definition we want */
23657             PERL_INT_FAST8_T retry_countdown = 10;
23658
23659             DECLARATION_FOR_GLOBAL_CONTEXT;
23660
23661             /* If we get here, we know this property is user-defined */
23662             *user_defined_ptr = TRUE;
23663
23664             /* We refuse to call a potentially tainted subroutine; returning an
23665              * error instead */
23666             if (TAINT_get) {
23667                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23668                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23669                 goto append_name_to_msg;
23670             }
23671
23672             /* In principle, we only call each subroutine property definition
23673              * once during the life of the program.  This guarantees that the
23674              * property definition never changes.  The results of the single
23675              * sub call are stored in a hash, which is used instead for future
23676              * references to this property.  The property definition is thus
23677              * immutable.  But, to allow the user to have a /i-dependent
23678              * definition, we call the sub once for non-/i, and once for /i,
23679              * should the need arise, passing the /i status as a parameter.
23680              *
23681              * We start by constructing the hash key name, consisting of the
23682              * fully qualified subroutine name, preceded by the /i status, so
23683              * that there is a key for /i and a different key for non-/i */
23684             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23685             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23686                                           non_pkg_begin != 0);
23687             sv_catsv(key, fq_name);
23688             sv_2mortal(key);
23689
23690             /* We only call the sub once throughout the life of the program
23691              * (with the /i, non-/i exception noted above).  That means the
23692              * hash must be global and accessible to all threads.  It is
23693              * created at program start-up, before any threads are created, so
23694              * is accessible to all children.  But this creates some
23695              * complications.
23696              *
23697              * 1) The keys can't be shared, or else problems arise; sharing is
23698              *    turned off at hash creation time
23699              * 2) All SVs in it are there for the remainder of the life of the
23700              *    program, and must be created in the same interpreter context
23701              *    as the hash, or else they will be freed from the wrong pool
23702              *    at global destruction time.  This is handled by switching to
23703              *    the hash's context to create each SV going into it, and then
23704              *    immediately switching back
23705              * 3) All accesses to the hash must be controlled by a mutex, to
23706              *    prevent two threads from getting an unstable state should
23707              *    they simultaneously be accessing it.  The code below is
23708              *    crafted so that the mutex is locked whenever there is an
23709              *    access and unlocked only when the next stable state is
23710              *    achieved.
23711              *
23712              * The hash stores either the definition of the property if it was
23713              * valid, or, if invalid, the error message that was raised.  We
23714              * use the type of SV to distinguish.
23715              *
23716              * There's also the need to guard against the definition expansion
23717              * from infinitely recursing.  This is handled by storing the aTHX
23718              * of the expanding thread during the expansion.  Again the SV type
23719              * is used to distinguish this from the other two cases.  If we
23720              * come to here and the hash entry for this property is our aTHX,
23721              * it means we have recursed, and the code assumes that we would
23722              * infinitely recurse, so instead stops and raises an error.
23723              * (Any recursion has always been treated as infinite recursion in
23724              * this feature.)
23725              *
23726              * If instead, the entry is for a different aTHX, it means that
23727              * that thread has gotten here first, and hasn't finished expanding
23728              * the definition yet.  We just have to wait until it is done.  We
23729              * sleep and retry a few times, returning an error if the other
23730              * thread doesn't complete. */
23731
23732           re_fetch:
23733             USER_PROP_MUTEX_LOCK;
23734
23735             /* If we have an entry for this key, the subroutine has already
23736              * been called once with this /i status. */
23737             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23738                                                    SvPVX(key), SvCUR(key), 0);
23739             if (saved_user_prop_ptr) {
23740
23741                 /* If the saved result is an inversion list, it is the valid
23742                  * definition of this property */
23743                 if (is_invlist(*saved_user_prop_ptr)) {
23744                     prop_definition = *saved_user_prop_ptr;
23745
23746                     /* The SV in the hash won't be removed until global
23747                      * destruction, so it is stable and we can unlock */
23748                     USER_PROP_MUTEX_UNLOCK;
23749
23750                     /* The caller shouldn't try to free this SV */
23751                     return prop_definition;
23752                 }
23753
23754                 /* Otherwise, if it is a string, it is the error message
23755                  * that was returned when we first tried to evaluate this
23756                  * property.  Fail, and append the message */
23757                 if (SvPOK(*saved_user_prop_ptr)) {
23758                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23759                     sv_catsv(msg, *saved_user_prop_ptr);
23760
23761                     /* The SV in the hash won't be removed until global
23762                      * destruction, so it is stable and we can unlock */
23763                     USER_PROP_MUTEX_UNLOCK;
23764
23765                     return NULL;
23766                 }
23767
23768                 assert(SvIOK(*saved_user_prop_ptr));
23769
23770                 /* Here, we have an unstable entry in the hash.  Either another
23771                  * thread is in the middle of expanding the property's
23772                  * definition, or we are ourselves recursing.  We use the aTHX
23773                  * in it to distinguish */
23774                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23775
23776                     /* Here, it's another thread doing the expanding.  We've
23777                      * looked as much as we are going to at the contents of the
23778                      * hash entry.  It's safe to unlock. */
23779                     USER_PROP_MUTEX_UNLOCK;
23780
23781                     /* Retry a few times */
23782                     if (retry_countdown-- > 0) {
23783                         PerlProc_sleep(1);
23784                         goto re_fetch;
23785                     }
23786
23787                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23788                     sv_catpvs(msg, "Timeout waiting for another thread to "
23789                                    "define");
23790                     goto append_name_to_msg;
23791                 }
23792
23793                 /* Here, we are recursing; don't dig any deeper */
23794                 USER_PROP_MUTEX_UNLOCK;
23795
23796                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23797                 sv_catpvs(msg,
23798                           "Infinite recursion in user-defined property");
23799                 goto append_name_to_msg;
23800             }
23801
23802             /* Here, this thread has exclusive control, and there is no entry
23803              * for this property in the hash.  So we have the go ahead to
23804              * expand the definition ourselves. */
23805
23806             PUSHSTACKi(PERLSI_MAGIC);
23807             ENTER;
23808
23809             /* Create a temporary placeholder in the hash to detect recursion
23810              * */
23811             SWITCH_TO_GLOBAL_CONTEXT;
23812             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23813             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23814             RESTORE_CONTEXT;
23815
23816             /* Now that we have a placeholder, we can let other threads
23817              * continue */
23818             USER_PROP_MUTEX_UNLOCK;
23819
23820             /* Make sure the placeholder always gets destroyed */
23821             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23822
23823             PUSHMARK(SP);
23824             SAVETMPS;
23825
23826             /* Call the user's function, with the /i status as a parameter.
23827              * Note that we have gone to a lot of trouble to keep this call
23828              * from being within the locked mutex region. */
23829             XPUSHs(boolSV(to_fold));
23830             PUTBACK;
23831
23832             /* The following block was taken from swash_init().  Presumably
23833              * they apply to here as well, though we no longer use a swash --
23834              * khw */
23835             SAVEHINTS();
23836             save_re_context();
23837             /* We might get here via a subroutine signature which uses a utf8
23838              * parameter name, at which point PL_subname will have been set
23839              * but not yet used. */
23840             save_item(PL_subname);
23841
23842             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23843
23844             SPAGAIN;
23845
23846             error = ERRSV;
23847             if (TAINT_get || SvTRUE(error)) {
23848                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23849                 if (SvTRUE(error)) {
23850                     sv_catpvs(msg, "Error \"");
23851                     sv_catsv(msg, error);
23852                     sv_catpvs(msg, "\"");
23853                 }
23854                 if (TAINT_get) {
23855                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
23856                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23857                 }
23858
23859                 if (name_len > 0) {
23860                     sv_catpvs(msg, " in expansion of ");
23861                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23862                                                                   name_len,
23863                                                                   name));
23864                 }
23865
23866                 (void) POPs;
23867                 prop_definition = NULL;
23868             }
23869             else {  /* G_SCALAR guarantees a single return value */
23870                 SV * contents = POPs;
23871
23872                 /* The contents is supposed to be the expansion of the property
23873                  * definition.  If the definition is deferrable, and we got an
23874                  * empty string back, set a flag to later defer it (after clean
23875                  * up below). */
23876                 if (      deferrable
23877                     && (! SvPOK(contents) || SvCUR(contents) == 0))
23878                 {
23879                         empty_return = TRUE;
23880                 }
23881                 else { /* Otherwise, call a function to check for valid syntax,
23882                           and handle it */
23883
23884                     prop_definition = handle_user_defined_property(
23885                                                     name, name_len,
23886                                                     is_utf8, to_fold, runtime,
23887                                                     deferrable,
23888                                                     contents, user_defined_ptr,
23889                                                     msg,
23890                                                     level);
23891                 }
23892             }
23893
23894             /* Here, we have the results of the expansion.  Delete the
23895              * placeholder, and if the definition is now known, replace it with
23896              * that definition.  We need exclusive access to the hash, and we
23897              * can't let anyone else in, between when we delete the placeholder
23898              * and add the permanent entry */
23899             USER_PROP_MUTEX_LOCK;
23900
23901             S_delete_recursion_entry(aTHX_ SvPVX(key));
23902
23903             if (    ! empty_return
23904                 && (! prop_definition || is_invlist(prop_definition)))
23905             {
23906                 /* If we got success we use the inversion list defining the
23907                  * property; otherwise use the error message */
23908                 SWITCH_TO_GLOBAL_CONTEXT;
23909                 (void) hv_store_ent(PL_user_def_props,
23910                                     key,
23911                                     ((prop_definition)
23912                                      ? newSVsv(prop_definition)
23913                                      : newSVsv(msg)),
23914                                     0);
23915                 RESTORE_CONTEXT;
23916             }
23917
23918             /* All done, and the hash now has a permanent entry for this
23919              * property.  Give up exclusive control */
23920             USER_PROP_MUTEX_UNLOCK;
23921
23922             FREETMPS;
23923             LEAVE;
23924             POPSTACK;
23925
23926             if (empty_return) {
23927                 goto definition_deferred;
23928             }
23929
23930             if (prop_definition) {
23931
23932                 /* If the definition is for something not known at this time,
23933                  * we toss it, and go return the main property name, as that's
23934                  * the one the user will be aware of */
23935                 if (! is_invlist(prop_definition)) {
23936                     SvREFCNT_dec_NN(prop_definition);
23937                     goto definition_deferred;
23938                 }
23939
23940                 sv_2mortal(prop_definition);
23941             }
23942
23943             /* And return */
23944             return prop_definition;
23945
23946         }   /* End of calling the subroutine for the user-defined property */
23947     }       /* End of it could be a user-defined property */
23948
23949     /* Here it wasn't a user-defined property that is known at this time.  See
23950      * if it is a Unicode property */
23951
23952     lookup_len = j;     /* This is a more mnemonic name than 'j' */
23953
23954     /* Get the index into our pointer table of the inversion list corresponding
23955      * to the property */
23956     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23957
23958     /* If it didn't find the property ... */
23959     if (table_index == 0) {
23960
23961         /* Try again stripping off any initial 'Is'.  This is because we
23962          * promise that an initial Is is optional.  The same isn't true of
23963          * names that start with 'In'.  Those can match only blocks, and the
23964          * lookup table already has those accounted for. */
23965         if (starts_with_Is) {
23966             lookup_name += 2;
23967             lookup_len -= 2;
23968             equals_pos -= 2;
23969             slash_pos -= 2;
23970
23971             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23972         }
23973
23974         if (table_index == 0) {
23975             char * canonical;
23976
23977             /* Here, we didn't find it.  If not a numeric type property, and
23978              * can't be a user-defined one, it isn't a legal property */
23979             if (! is_nv_type) {
23980                 if (! could_be_user_defined) {
23981                     goto failed;
23982                 }
23983
23984                 /* Here, the property name is legal as a user-defined one.   At
23985                  * compile time, it might just be that the subroutine for that
23986                  * property hasn't been encountered yet, but at runtime, it's
23987                  * an error to try to use an undefined one */
23988                 if (! deferrable) {
23989                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23990                     sv_catpvs(msg, "Unknown user-defined property name");
23991                     goto append_name_to_msg;
23992                 }
23993
23994                 goto definition_deferred;
23995             } /* End of isn't a numeric type property */
23996
23997             /* The numeric type properties need more work to decide.  What we
23998              * do is make sure we have the number in canonical form and look
23999              * that up. */
24000
24001             if (slash_pos < 0) {    /* No slash */
24002
24003                 /* When it isn't a rational, take the input, convert it to a
24004                  * NV, then create a canonical string representation of that
24005                  * NV. */
24006
24007                 NV value;
24008                 SSize_t value_len = lookup_len - equals_pos;
24009
24010                 /* Get the value */
24011                 if (   value_len <= 0
24012                     || my_atof3(lookup_name + equals_pos, &value,
24013                                 value_len)
24014                           != lookup_name + lookup_len)
24015                 {
24016                     goto failed;
24017                 }
24018
24019                 /* If the value is an integer, the canonical value is integral
24020                  * */
24021                 if (Perl_ceil(value) == value) {
24022                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24023                                             equals_pos, lookup_name, value);
24024                 }
24025                 else {  /* Otherwise, it is %e with a known precision */
24026                     char * exp_ptr;
24027
24028                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24029                                                 equals_pos, lookup_name,
24030                                                 PL_E_FORMAT_PRECISION, value);
24031
24032                     /* The exponent generated is expecting two digits, whereas
24033                      * %e on some systems will generate three.  Remove leading
24034                      * zeros in excess of 2 from the exponent.  We start
24035                      * looking for them after the '=' */
24036                     exp_ptr = strchr(canonical + equals_pos, 'e');
24037                     if (exp_ptr) {
24038                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24039                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24040
24041                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24042
24043                         if (excess_exponent_len > 0) {
24044                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24045                             SSize_t excess_leading_zeros
24046                                     = MIN(leading_zeros, excess_exponent_len);
24047                             if (excess_leading_zeros > 0) {
24048                                 Move(cur_ptr + excess_leading_zeros,
24049                                      cur_ptr,
24050                                      strlen(cur_ptr) - excess_leading_zeros
24051                                        + 1,  /* Copy the NUL as well */
24052                                      char);
24053                             }
24054                         }
24055                     }
24056                 }
24057             }
24058             else {  /* Has a slash.  Create a rational in canonical form  */
24059                 UV numerator, denominator, gcd, trial;
24060                 const char * end_ptr;
24061                 const char * sign = "";
24062
24063                 /* We can't just find the numerator, denominator, and do the
24064                  * division, then use the method above, because that is
24065                  * inexact.  And the input could be a rational that is within
24066                  * epsilon (given our precision) of a valid rational, and would
24067                  * then incorrectly compare valid.
24068                  *
24069                  * We're only interested in the part after the '=' */
24070                 const char * this_lookup_name = lookup_name + equals_pos;
24071                 lookup_len -= equals_pos;
24072                 slash_pos -= equals_pos;
24073
24074                 /* Handle any leading minus */
24075                 if (this_lookup_name[0] == '-') {
24076                     sign = "-";
24077                     this_lookup_name++;
24078                     lookup_len--;
24079                     slash_pos--;
24080                 }
24081
24082                 /* Convert the numerator to numeric */
24083                 end_ptr = this_lookup_name + slash_pos;
24084                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24085                     goto failed;
24086                 }
24087
24088                 /* It better have included all characters before the slash */
24089                 if (*end_ptr != '/') {
24090                     goto failed;
24091                 }
24092
24093                 /* Set to look at just the denominator */
24094                 this_lookup_name += slash_pos;
24095                 lookup_len -= slash_pos;
24096                 end_ptr = this_lookup_name + lookup_len;
24097
24098                 /* Convert the denominator to numeric */
24099                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24100                     goto failed;
24101                 }
24102
24103                 /* It better be the rest of the characters, and don't divide by
24104                  * 0 */
24105                 if (   end_ptr != this_lookup_name + lookup_len
24106                     || denominator == 0)
24107                 {
24108                     goto failed;
24109                 }
24110
24111                 /* Get the greatest common divisor using
24112                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24113                 gcd = numerator;
24114                 trial = denominator;
24115                 while (trial != 0) {
24116                     UV temp = trial;
24117                     trial = gcd % trial;
24118                     gcd = temp;
24119                 }
24120
24121                 /* If already in lowest possible terms, we have already tried
24122                  * looking this up */
24123                 if (gcd == 1) {
24124                     goto failed;
24125                 }
24126
24127                 /* Reduce the rational, which should put it in canonical form
24128                  * */
24129                 numerator /= gcd;
24130                 denominator /= gcd;
24131
24132                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24133                         equals_pos, lookup_name, sign, numerator, denominator);
24134             }
24135
24136             /* Here, we have the number in canonical form.  Try that */
24137             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
24138             if (table_index == 0) {
24139                 goto failed;
24140             }
24141         }   /* End of still didn't find the property in our table */
24142     }       /* End of       didn't find the property in our table */
24143
24144     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24145      * A negative return signifies that the real index is the absolute value,
24146      * but the result needs to be inverted */
24147     if (table_index < 0) {
24148         invert_return = TRUE;
24149         table_index = -table_index;
24150     }
24151
24152     /* Out-of-band indices indicate a deprecated property.  The proper index is
24153      * that value modulo the table size.  And dividing by the table size yields
24154      * an offset into a table constructed by regen/mk_invlists.pl to contain
24155      * the corresponding warning message */
24156     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24157         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24158         table_index %= MAX_UNI_KEYWORD_INDEX;
24159         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24160                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24161                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
24162     }
24163
24164     /* In a few properties, a different property is used under /i.  These are
24165      * unlikely to change, so are hard-coded here. */
24166     if (to_fold) {
24167         if (   table_index == UNI_XPOSIXUPPER
24168             || table_index == UNI_XPOSIXLOWER
24169             || table_index == UNI_TITLE)
24170         {
24171             table_index = UNI_CASED;
24172         }
24173         else if (   table_index == UNI_UPPERCASELETTER
24174                  || table_index == UNI_LOWERCASELETTER
24175 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24176                  || table_index == UNI_TITLECASELETTER
24177 #  endif
24178         ) {
24179             table_index = UNI_CASEDLETTER;
24180         }
24181         else if (  table_index == UNI_POSIXUPPER
24182                 || table_index == UNI_POSIXLOWER)
24183         {
24184             table_index = UNI_POSIXALPHA;
24185         }
24186     }
24187
24188     /* Create and return the inversion list */
24189     prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
24190     sv_2mortal(prop_definition);
24191
24192
24193     /* See if there is a private use override to add to this definition */
24194     {
24195         COPHH * hinthash = (IN_PERL_COMPILETIME)
24196                            ? CopHINTHASH_get(&PL_compiling)
24197                            : CopHINTHASH_get(PL_curcop);
24198         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24199
24200         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24201
24202             /* See if there is an element in the hints hash for this table */
24203             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24204             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24205
24206             if (pos) {
24207                 bool dummy;
24208                 SV * pu_definition;
24209                 SV * pu_invlist;
24210                 SV * expanded_prop_definition =
24211                             sv_2mortal(invlist_clone(prop_definition, NULL));
24212
24213                 /* If so, its definition is the string from here to the next
24214                  * \a character.  And its format is the same as a user-defined
24215                  * property */
24216                 pos += SvCUR(pu_lookup);
24217                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24218                 pu_invlist = handle_user_defined_property(lookup_name,
24219                                                           lookup_len,
24220                                                           0, /* Not UTF-8 */
24221                                                           0, /* Not folded */
24222                                                           runtime,
24223                                                           deferrable,
24224                                                           pu_definition,
24225                                                           &dummy,
24226                                                           msg,
24227                                                           level);
24228                 if (TAINT_get) {
24229                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24230                     sv_catpvs(msg, "Insecure private-use override");
24231                     goto append_name_to_msg;
24232                 }
24233
24234                 /* For now, as a safety measure, make sure that it doesn't
24235                  * override non-private use code points */
24236                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24237
24238                 /* Add it to the list to be returned */
24239                 _invlist_union(prop_definition, pu_invlist,
24240                                &expanded_prop_definition);
24241                 prop_definition = expanded_prop_definition;
24242                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24243             }
24244         }
24245     }
24246
24247     if (invert_return) {
24248         _invlist_invert(prop_definition);
24249     }
24250     return prop_definition;
24251
24252
24253   failed:
24254     if (non_pkg_begin != 0) {
24255         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24256         sv_catpvs(msg, "Illegal user-defined property name");
24257     }
24258     else {
24259         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24260         sv_catpvs(msg, "Can't find Unicode property definition");
24261     }
24262     /* FALLTHROUGH */
24263
24264   append_name_to_msg:
24265     {
24266         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
24267         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
24268
24269         sv_catpv(msg, prefix);
24270         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24271         sv_catpv(msg, suffix);
24272     }
24273
24274     return NULL;
24275
24276   definition_deferred:
24277
24278     /* Here it could yet be defined, so defer evaluation of this
24279      * until it's needed at runtime.  We need the fully qualified property name
24280      * to avoid ambiguity, and a trailing newline */
24281     if (! fq_name) {
24282         fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24283                                       non_pkg_begin != 0 /* If has "::" */
24284                                );
24285     }
24286     sv_catpvs(fq_name, "\n");
24287
24288     *user_defined_ptr = TRUE;
24289     return fq_name;
24290 }
24291
24292 #endif
24293
24294 /*
24295  * ex: set ts=8 sts=4 sw=4 et:
24296  */