regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73
  74 /* Note on debug output:
  75  *
  76  * This is set up so that -Dr turns on debugging like all other flags that are
  77  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
  78  * all regular expressions encountered in a program, and gives a huge amount of
  79  * output for all but the shortest programs.
  80  *
  81  * The ability to output pattern debugging information lexically, and with much
  82  * finer grained control was added, with 'use re qw(Debug ....);' available even
  83  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
  84  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
  85  * Those files are compiled and linked into the perl executable, and they are
  86  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
  87  * to re.pm.
  88  *
  89  * That would normally mean linking errors when two functions of the same name
  90  * are attempted to be placed into the same executable.  That is solved in one
  91  * of four ways:
  92  *  1)  Static functions aren't known outside the file they are in, so for the
  93  *      many functions of that type in this file, it just isn't a problem.
  94  *  2)  Most externally known functions are enclosed in
  95  *          #ifndef PERL_IN_XSUB_RE
  96  *          ...
  97  *          #endif
  98  *      blocks, so there is only one defintion for them in the whole
  99  *      executable, the one in regcomp.c (or regexec.c).  The implication of
 100  *      that is any debugging info that comes from them is controlled only by
 101  *      -Dr.  Further, any static function they call will also be the version
 102  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
 103  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
 104  *      have different names, so that what gets loaded in the executable is
 105  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
 106  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
 107  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
 108  *      versions and their callees are under control of re.pm.   The catch is
 109  *      that references to all these go through the regexp_engine structure,
 110  *      which is initialized in regcomp.h to the Perl_foo versions, and
 111  *      substituted out in lexical scopes where 'use re' is in effect to the
 112  *      'my_foo' ones.   That structure is public API, so it would be a hard
 113  *      sell to add any additional members.
 114  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
 115  *      respectively, regexec.c and re_exec.c, they can have two different
 116  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
 117  *      embed.fnc.
 118  *
 119  * The bottom line is that if you add code to one of the public functions
 120  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
 121  * a new function that needs to do debugging or there is a chain of calls from
 122  * it that need to do debugging, all functions in the chain should use options
 123  * 2) or 4) above.
 124  *
 125  * A function may have to be split so that debugging stuff is static, but it
 126  * calls out to some other function that only gets compiled in regcomp.c to
 127  * access data that we don't want to duplicate.
 128  */
 129
 130 #include "EXTERN.h"
 131 #define PERL_IN_REGCOMP_C
 132 #include "perl.h"
 133
 134 #define REG_COMP_C
 135 #ifdef PERL_IN_XSUB_RE
 136 #  include "re_comp.h"
 137 EXTERN_C const struct regexp_engine my_reg_engine;
 138 EXTERN_C const struct regexp_engine wild_reg_engine;
 139 #else
 140 #  include "regcomp.h"
 141 #endif
 142
 143 #include "invlist_inline.h"
 144 #include "unicode_constants.h"
 145
 146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
 147  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
 149  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 152
 153 #ifndef STATIC
 154 #define STATIC  static
 155 #endif
 156
 157 /* this is a chain of data about sub patterns we are processing that
 158    need to be handled separately/specially in study_chunk. Its so
 159    we can simulate recursion without losing state.  */
 160 struct scan_frame;
 161 typedef struct scan_frame {
 162     regnode *last_regnode;      /* last node to process in this frame */
 163     regnode *next_regnode;      /* next node to process when last is reached */
 164     U32 prev_recursed_depth;
 165     I32 stopparen;              /* what stopparen do we use */
 166     bool in_gosub;              /* this or an outer frame is for GOSUB */
 167
 168     struct scan_frame *this_prev_frame; /* this previous frame */
 169     struct scan_frame *prev_frame;      /* previous frame */
 170     struct scan_frame *next_frame;      /* next frame */
 171 } scan_frame;
 172
 173 /* Certain characters are output as a sequence with the first being a
 174  * backslash. */
 175 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
 176
 177
 178 struct RExC_state_t {
 179     U32         flags;                  /* RXf_* are we folding, multilining? */
 180     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 181     char        *precomp;               /* uncompiled string. */
 182     char        *precomp_end;           /* pointer to end of uncompiled string. */
 183     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 184     regexp      *rx;                    /* perl core regexp structure */
 185     regexp_internal     *rxi;           /* internal data for regexp object
 186                                            pprivate field */
 187     char        *start;                 /* Start of input for compile */
 188     char        *end;                   /* End of input for compile */
 189     char        *parse;                 /* Input-scan pointer. */
 190     char        *copy_start;            /* start of copy of input within
 191                                            constructed parse string */
 192     char        *save_copy_start;       /* Provides one level of saving
 193                                            and restoring 'copy_start' */
 194     char        *copy_start_in_input;   /* Position in input string
 195                                            corresponding to copy_start */
 196     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 197     regnode     *emit_start;            /* Start of emitted-code area */
 198     regnode_offset emit;                /* Code-emit pointer */
 199     I32         naughty;                /* How bad is this pattern? */
 200     I32         sawback;                /* Did we see \1, ...? */
 201     SSize_t     size;                   /* Number of regnode equivalents in
 202                                            pattern */
 203     Size_t      sets_depth;              /* Counts recursion depth of already-
 204                                            compiled regex set patterns */
 205     U32         seen;
 206
 207     I32      parens_buf_size;           /* #slots malloced open/close_parens */
 208     regnode_offset *open_parens;        /* offsets to open parens */
 209     regnode_offset *close_parens;       /* offsets to close parens */
 210     HV          *paren_names;           /* Paren names */
 211
 212     /* position beyond 'precomp' of the warning message furthest away from
 213      * 'precomp'.  During the parse, no warnings are raised for any problems
 214      * earlier in the parse than this position.  This works if warnings are
 215      * raised the first time a given spot is parsed, and if only one
 216      * independent warning is raised for any given spot */
 217     Size_t      latest_warn_offset;
 218
 219     I32         npar;                   /* Capture buffer count so far in the
 220                                            parse, (OPEN) plus one. ("par" 0 is
 221                                            the whole pattern)*/
 222     I32         total_par;              /* During initial parse, is either 0,
 223                                            or -1; the latter indicating a
 224                                            reparse is needed.  After that pass,
 225                                            it is what 'npar' became after the
 226                                            pass.  Hence, it being > 0 indicates
 227                                            we are in a reparse situation */
 228     I32         nestroot;               /* root parens we are in - used by
 229                                            accept */
 230     I32         seen_zerolen;
 231     regnode     *end_op;                /* END node in program */
 232     I32         utf8;           /* whether the pattern is utf8 or not */
 233     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 234                                 /* XXX use this for future optimisation of case
 235                                  * where pattern must be upgraded to utf8. */
 236     I32         uni_semantics;  /* If a d charset modifier should use unicode
 237                                    rules, even if the pattern is not in
 238                                    utf8 */
 239
 240     I32         recurse_count;          /* Number of recurse regops we have generated */
 241     regnode     **recurse;              /* Recurse regops */
 242     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 243                                            through */
 244     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 245     I32         in_lookbehind;
 246     I32         in_lookahead;
 247     I32         contains_locale;
 248     I32         override_recoding;
 249     I32         recode_x_to_native;
 250     I32         in_multi_char_class;
 251     int         code_index;             /* next code_blocks[] slot */
 252     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 253                                             within pattern */
 254     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 255     scan_frame *frame_head;
 256     scan_frame *frame_last;
 257     U32         frame_count;
 258     AV         *warn_text;
 259     HV         *unlexed_names;
 260     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 261 #ifdef DEBUGGING
 262     const char  *lastparse;
 263     I32         lastnum;
 264     U32         study_chunk_recursed_count;
 265     AV          *paren_name_list;       /* idx -> name */
 266     SV          *mysv1;
 267     SV          *mysv2;
 268
 269 #define RExC_lastparse  (pRExC_state->lastparse)
 270 #define RExC_lastnum    (pRExC_state->lastnum)
 271 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 272 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 273 #define RExC_mysv       (pRExC_state->mysv1)
 274 #define RExC_mysv1      (pRExC_state->mysv1)
 275 #define RExC_mysv2      (pRExC_state->mysv2)
 276
 277 #endif
 278     bool        seen_d_op;
 279     bool        strict;
 280     bool        study_started;
 281     bool        in_script_run;
 282     bool        use_BRANCHJ;
 283     bool        sWARN_EXPERIMENTAL__VLB;
 284     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
 285 };
 286
 287 #define RExC_flags      (pRExC_state->flags)
 288 #define RExC_pm_flags   (pRExC_state->pm_flags)
 289 #define RExC_precomp    (pRExC_state->precomp)
 290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 291 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 292 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
 293 #define RExC_precomp_end (pRExC_state->precomp_end)
 294 #define RExC_rx_sv      (pRExC_state->rx_sv)
 295 #define RExC_rx         (pRExC_state->rx)
 296 #define RExC_rxi        (pRExC_state->rxi)
 297 #define RExC_start      (pRExC_state->start)
 298 #define RExC_end        (pRExC_state->end)
 299 #define RExC_parse      (pRExC_state->parse)
 300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 301 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 303                                                    under /d from /u ? */
 304
 305 #ifdef RE_TRACK_PATTERN_OFFSETS
 306 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 307                                                          others */
 308 #endif
 309 #define RExC_emit       (pRExC_state->emit)
 310 #define RExC_emit_start (pRExC_state->emit_start)
 311 #define RExC_sawback    (pRExC_state->sawback)
 312 #define RExC_seen       (pRExC_state->seen)
 313 #define RExC_size       (pRExC_state->size)
 314 #define RExC_maxlen        (pRExC_state->maxlen)
 315 #define RExC_npar       (pRExC_state->npar)
 316 #define RExC_total_parens       (pRExC_state->total_par)
 317 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 318 #define RExC_nestroot   (pRExC_state->nestroot)
 319 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 320 #define RExC_utf8       (pRExC_state->utf8)
 321 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 322 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 323 #define RExC_open_parens        (pRExC_state->open_parens)
 324 #define RExC_close_parens       (pRExC_state->close_parens)
 325 #define RExC_end_op     (pRExC_state->end_op)
 326 #define RExC_paren_names        (pRExC_state->paren_names)
 327 #define RExC_recurse    (pRExC_state->recurse)
 328 #define RExC_recurse_count      (pRExC_state->recurse_count)
 329 #define RExC_sets_depth         (pRExC_state->sets_depth)
 330 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 331 #define RExC_study_chunk_recursed_bytes  \
 332                                    (pRExC_state->study_chunk_recursed_bytes)
 333 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 334 #define RExC_in_lookahead       (pRExC_state->in_lookahead)
 335 #define RExC_contains_locale    (pRExC_state->contains_locale)
 336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 337
 338 #ifdef EBCDIC
 339 #  define SET_recode_x_to_native(x)                                         \
 340                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 341 #else
 342 #  define SET_recode_x_to_native(x) NOOP
 343 #endif
 344
 345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 346 #define RExC_frame_head (pRExC_state->frame_head)
 347 #define RExC_frame_last (pRExC_state->frame_last)
 348 #define RExC_frame_count (pRExC_state->frame_count)
 349 #define RExC_strict (pRExC_state->strict)
 350 #define RExC_study_started      (pRExC_state->study_started)
 351 #define RExC_warn_text (pRExC_state->warn_text)
 352 #define RExC_in_script_run      (pRExC_state->in_script_run)
 353 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
 355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
 356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 357
 358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 359  * a flag to disable back-off on the fixed/floating substrings - if it's
 360  * a high complexity pattern we assume the benefit of avoiding a full match
 361  * is worth the cost of checking for the substrings even if they rarely help.
 362  */
 363 #define RExC_naughty    (pRExC_state->naughty)
 364 #define TOO_NAUGHTY (10)
 365 #define MARK_NAUGHTY(add) \
 366     if (RExC_naughty < TOO_NAUGHTY) \
 367         RExC_naughty += (add)
 368 #define MARK_NAUGHTY_EXP(exp, add) \
 369     if (RExC_naughty < TOO_NAUGHTY) \
 370         RExC_naughty += RExC_naughty / (exp) + (add)
 371
 372 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 373 #define ISMULT2(s)      (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
 374
 375 /*
 376  * Flags to be passed up and down.
 377  */
 378 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 379                                    non-null ones. */
 380
 381 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 382  * character.  (There needs to be a case: in the switch statement in regexec.c
 383  * for any node marked SIMPLE.)  Note that this is not the same thing as
 384  * REGNODE_SIMPLE */
 385 #define SIMPLE          0x02
 386 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 387 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 388 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 389 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 390                                    calcuate sizes as UTF-8 */
 391
 392 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 393
 394 /* whether trie related optimizations are enabled */
 395 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 396 #define TRIE_STUDY_OPT
 397 #define FULL_TRIE_STUDY
 398 #define TRIE_STCLASS
 399 #endif
 400
 401
 402
 403 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 404 #define PBITVAL(paren) (1 << ((paren) & 7))
 405 #define PAREN_OFFSET(depth) \
 406     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
 407 #define PAREN_TEST(depth, paren) \
 408     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
 409 #define PAREN_SET(depth, paren) \
 410     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
 411 #define PAREN_UNSET(depth, paren) \
 412     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 413
 414 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 415                                      if (!UTF) {                           \
 416                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 417                                          return 0;                         \
 418                                      }                                     \
 419                              } STMT_END
 420
 421 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 422  * a flag that indicates we need to override /d with /u as a result of
 423  * something in the pattern.  It should only be used in regards to calling
 424  * set_regex_charset() or get_regex_charset() */
 425 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 426     STMT_START {                                                            \
 427             if (DEPENDS_SEMANTICS) {                                        \
 428                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 429                 RExC_uni_semantics = 1;                                     \
 430                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 431                     /* No need to restart the parse if we haven't seen      \
 432                      * anything that differs between /u and /d, and no need \
 433                      * to restart immediately if we're going to reparse     \
 434                      * anyway to count parens */                            \
 435                     *flagp |= RESTART_PARSE;                                \
 436                     return restart_retval;                                  \
 437                 }                                                           \
 438             }                                                               \
 439     } STMT_END
 440
 441 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 442     STMT_START {                                                            \
 443                 RExC_use_BRANCHJ = 1;                                       \
 444                 *flagp |= RESTART_PARSE;                                    \
 445                 return restart_retval;                                      \
 446     } STMT_END
 447
 448 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 449  * less.  After that, it must always be positive, because the whole re is
 450  * considered to be surrounded by virtual parens.  Setting it to negative
 451  * indicates there is some construct that needs to know the actual number of
 452  * parens to be properly handled.  And that means an extra pass will be
 453  * required after we've counted them all */
 454 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 455 #define REQUIRE_PARENS_PASS                                                 \
 456     STMT_START {  /* No-op if have completed a pass */                      \
 457                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 458     } STMT_END
 459 #define IN_PARENS_PASS (RExC_total_parens < 0)
 460
 461
 462 /* This is used to return failure (zero) early from the calling function if
 463  * various flags in 'flags' are set.  Two flags always cause a return:
 464  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 465  * additional flags that should cause a return; 0 if none.  If the return will
 466  * be done, '*flagp' is first set to be all of the flags that caused the
 467  * return. */
 468 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 469     STMT_START {                                                            \
 470             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 471                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 472                 return 0;                                                   \
 473             }                                                               \
 474     } STMT_END
 475
 476 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 477
 478 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 479                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 480 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 481                                     if (MUST_RESTART(*(flagp))) return 0
 482
 483 /* This converts the named class defined in regcomp.h to its equivalent class
 484  * number defined in handy.h. */
 485 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 486 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 487
 488 #define _invlist_union_complement_2nd(a, b, output) \
 489                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 490 #define _invlist_intersection_complement_2nd(a, b, output) \
 491                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 492
 493 /* We add a marker if we are deferring expansion of a property that is both
 494  * 1) potentiallly user-defined; and
 495  * 2) could also be an official Unicode property.
 496  *
 497  * Without this marker, any deferred expansion can only be for a user-defined
 498  * one.  This marker shouldn't conflict with any that could be in a legal name,
 499  * and is appended to its name to indicate this.  There is a string and
 500  * character form */
 501 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
 502 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
 503
 504 /* What is infinity for optimization purposes */
 505 #define OPTIMIZE_INFTY  SSize_t_MAX
 506
 507 /* About scan_data_t.
 508
 509   During optimisation we recurse through the regexp program performing
 510   various inplace (keyhole style) optimisations. In addition study_chunk
 511   and scan_commit populate this data structure with information about
 512   what strings MUST appear in the pattern. We look for the longest
 513   string that must appear at a fixed location, and we look for the
 514   longest string that may appear at a floating location. So for instance
 515   in the pattern:
 516
 517     /FOO[xX]A.*B[xX]BAR/
 518
 519   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 520   strings (because they follow a .* construct). study_chunk will identify
 521   both FOO and BAR as being the longest fixed and floating strings respectively.
 522
 523   The strings can be composites, for instance
 524
 525      /(f)(o)(o)/
 526
 527   will result in a composite fixed substring 'foo'.
 528
 529   For each string some basic information is maintained:
 530
 531   - min_offset
 532     This is the position the string must appear at, or not before.
 533     It also implicitly (when combined with minlenp) tells us how many
 534     characters must match before the string we are searching for.
 535     Likewise when combined with minlenp and the length of the string it
 536     tells us how many characters must appear after the string we have
 537     found.
 538
 539   - max_offset
 540     Only used for floating strings. This is the rightmost point that
 541     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
 542     string can occur infinitely far to the right.
 543     For fixed strings, it is equal to min_offset.
 544
 545   - minlenp
 546     A pointer to the minimum number of characters of the pattern that the
 547     string was found inside. This is important as in the case of positive
 548     lookahead or positive lookbehind we can have multiple patterns
 549     involved. Consider
 550
 551     /(?=FOO).*F/
 552
 553     The minimum length of the pattern overall is 3, the minimum length
 554     of the lookahead part is 3, but the minimum length of the part that
 555     will actually match is 1. So 'FOO's minimum length is 3, but the
 556     minimum length for the F is 1. This is important as the minimum length
 557     is used to determine offsets in front of and behind the string being
 558     looked for.  Since strings can be composites this is the length of the
 559     pattern at the time it was committed with a scan_commit. Note that
 560     the length is calculated by study_chunk, so that the minimum lengths
 561     are not known until the full pattern has been compiled, thus the
 562     pointer to the value.
 563
 564   - lookbehind
 565
 566     In the case of lookbehind the string being searched for can be
 567     offset past the start point of the final matching string.
 568     If this value was just blithely removed from the min_offset it would
 569     invalidate some of the calculations for how many chars must match
 570     before or after (as they are derived from min_offset and minlen and
 571     the length of the string being searched for).
 572     When the final pattern is compiled and the data is moved from the
 573     scan_data_t structure into the regexp structure the information
 574     about lookbehind is factored in, with the information that would
 575     have been lost precalculated in the end_shift field for the
 576     associated string.
 577
 578   The fields pos_min and pos_delta are used to store the minimum offset
 579   and the delta to the maximum offset at the current point in the pattern.
 580
 581 */
 582
 583 struct scan_data_substrs {
 584     SV      *str;       /* longest substring found in pattern */
 585     SSize_t min_offset; /* earliest point in string it can appear */
 586     SSize_t max_offset; /* latest point in string it can appear */
 587     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 588     SSize_t lookbehind; /* is the pos of the string modified by LB */
 589     I32 flags;          /* per substring SF_* and SCF_* flags */
 590 };
 591
 592 typedef struct scan_data_t {
 593     /*I32 len_min;      unused */
 594     /*I32 len_delta;    unused */
 595     SSize_t pos_min;
 596     SSize_t pos_delta;
 597     SV *last_found;
 598     SSize_t last_end;       /* min value, <0 unless valid. */
 599     SSize_t last_start_min;
 600     SSize_t last_start_max;
 601     U8      cur_is_floating; /* whether the last_* values should be set as
 602                               * the next fixed (0) or floating (1)
 603                               * substring */
 604
 605     /* [0] is longest fixed substring so far, [1] is longest float so far */
 606     struct scan_data_substrs  substrs[2];
 607
 608     I32 flags;             /* common SF_* and SCF_* flags */
 609     I32 whilem_c;
 610     SSize_t *last_closep;
 611     regnode_ssc *start_class;
 612 } scan_data_t;
 613
 614 /*
 615  * Forward declarations for pregcomp()'s friends.
 616  */
 617
 618 static const scan_data_t zero_scan_data = {
 619     0, 0, NULL, 0, 0, 0, 0,
 620     {
 621         { NULL, 0, 0, 0, 0, 0 },
 622         { NULL, 0, 0, 0, 0, 0 },
 623     },
 624     0, 0, NULL, NULL
 625 };
 626
 627 /* study flags */
 628
 629 #define SF_BEFORE_SEOL          0x0001
 630 #define SF_BEFORE_MEOL          0x0002
 631 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 632
 633 #define SF_IS_INF               0x0040
 634 #define SF_HAS_PAR              0x0080
 635 #define SF_IN_PAR               0x0100
 636 #define SF_HAS_EVAL             0x0200
 637
 638
 639 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 640  * longest substring in the pattern. When it is not set the optimiser keeps
 641  * track of position, but does not keep track of the actual strings seen,
 642  *
 643  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 644  * /foo/i will not.
 645  *
 646  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 647  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 648  * turned off because of the alternation (BRANCH). */
 649 #define SCF_DO_SUBSTR           0x0400
 650
 651 #define SCF_DO_STCLASS_AND      0x0800
 652 #define SCF_DO_STCLASS_OR       0x1000
 653 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 654 #define SCF_WHILEM_VISITED_POS  0x2000
 655
 656 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 657 #define SCF_SEEN_ACCEPT         0x8000
 658 #define SCF_TRIE_DOING_RESTUDY 0x10000
 659 #define SCF_IN_DEFINE          0x20000
 660
 661
 662
 663
 664 #define UTF cBOOL(RExC_utf8)
 665
 666 /* The enums for all these are ordered so things work out correctly */
 667 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 668 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 669                                                      == REGEX_DEPENDS_CHARSET)
 670 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 671 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 672                                                      >= REGEX_UNICODE_CHARSET)
 673 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 674                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 675 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 676                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 677 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 678                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 679
 680 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 681
 682 /* For programs that want to be strictly Unicode compatible by dying if any
 683  * attempt is made to match a non-Unicode code point against a Unicode
 684  * property.  */
 685 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 686
 687 #define OOB_NAMEDCLASS          -1
 688
 689 /* There is no code point that is out-of-bounds, so this is problematic.  But
 690  * its only current use is to initialize a variable that is always set before
 691  * looked at. */
 692 #define OOB_UNICODE             0xDEADBEEF
 693
 694 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 695
 696
 697 /* length of regex to show in messages that don't mark a position within */
 698 #define RegexLengthToShowInErrorMessages 127
 699
 700 /*
 701  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 702  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 703  * op/pragma/warn/regcomp.
 704  */
 705 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 706 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 707
 708 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 709                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 710
 711 /* The code in this file in places uses one level of recursion with parsing
 712  * rebased to an alternate string constructed by us in memory.  This can take
 713  * the form of something that is completely different from the input, or
 714  * something that uses the input as part of the alternate.  In the first case,
 715  * there should be no possibility of an error, as we are in complete control of
 716  * the alternate string.  But in the second case we don't completely control
 717  * the input portion, so there may be errors in that.  Here's an example:
 718  *      /[abc\x{DF}def]/ui
 719  * is handled specially because \x{df} folds to a sequence of more than one
 720  * character: 'ss'.  What is done is to create and parse an alternate string,
 721  * which looks like this:
 722  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 723  * where it uses the input unchanged in the middle of something it constructs,
 724  * which is a branch for the DF outside the character class, and clustering
 725  * parens around the whole thing. (It knows enough to skip the DF inside the
 726  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 727  * need to be reported.  The general situation looks like this:
 728  *
 729  *                                       |<------- identical ------>|
 730  *              sI                       tI               xI       eI
 731  * Input:       ---------------------------------------------------------------
 732  * Constructed:         ---------------------------------------------------
 733  *                      sC               tC               xC       eC     EC
 734  *                                       |<------- identical ------>|
 735  *
 736  * sI..eI   is the portion of the input pattern we are concerned with here.
 737  * sC..EC   is the constructed substitute parse string.
 738  *  sC..tC  is constructed by us
 739  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 740  *          In the diagram, these are vertically aligned.
 741  *  eC..EC  is also constructed by us.
 742  * xC       is the position in the substitute parse string where we found a
 743  *          problem.
 744  * xI       is the position in the original pattern corresponding to xC.
 745  *
 746  * We want to display a message showing the real input string.  Thus we need to
 747  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 748  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 749  * get:
 750  *      xI = tI + (xC - tC)
 751  *
 752  * When the substitute parse is constructed, the code needs to set:
 753  *      RExC_start (sC)
 754  *      RExC_end (eC)
 755  *      RExC_copy_start_in_input  (tI)
 756  *      RExC_copy_start_in_constructed (tC)
 757  * and restore them when done.
 758  *
 759  * During normal processing of the input pattern, both
 760  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 761  * sI, so that xC equals xI.
 762  */
 763
 764 #define sI              RExC_precomp
 765 #define eI              RExC_precomp_end
 766 #define sC              RExC_start
 767 #define eC              RExC_end
 768 #define tI              RExC_copy_start_in_input
 769 #define tC              RExC_copy_start_in_constructed
 770 #define xI(xC)          (tI + (xC - tC))
 771 #define xI_offset(xC)   (xI(xC) - sI)
 772
 773 #define REPORT_LOCATION_ARGS(xC)                                            \
 774     UTF8fARG(UTF,                                                           \
 775              (xI(xC) > eI) /* Don't run off end */                          \
 776               ? eI - sI   /* Length before the <--HERE */                   \
 777               : ((xI_offset(xC) >= 0)                                       \
 778                  ? xI_offset(xC)                                            \
 779                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 780                                     IVdf " trying to output message for "   \
 781                                     " pattern %.*s",                        \
 782                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 783                                     ((int) (eC - sC)), sC), 0)),            \
 784              sI),         /* The input pattern printed up to the <--HERE */ \
 785     UTF8fARG(UTF,                                                           \
 786              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 787              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 788
 789 /* Used to point after bad bytes for an error message, but avoid skipping
 790  * past a nul byte. */
 791 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 792
 793 /* Set up to clean up after our imminent demise */
 794 #define PREPARE_TO_DIE                                                      \
 795     STMT_START {                                                            \
 796         if (RExC_rx_sv)                                                     \
 797             SAVEFREESV(RExC_rx_sv);                                         \
 798         if (RExC_open_parens)                                               \
 799             SAVEFREEPV(RExC_open_parens);                                   \
 800         if (RExC_close_parens)                                              \
 801             SAVEFREEPV(RExC_close_parens);                                  \
 802     } STMT_END
 803
 804 /*
 805  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 806  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 807  * "...".
 808  */
 809 #define _FAIL(code) STMT_START {                                        \
 810     const char *ellipses = "";                                          \
 811     IV len = RExC_precomp_end - RExC_precomp;                           \
 812                                                                         \
 813     PREPARE_TO_DIE;                                                     \
 814     if (len > RegexLengthToShowInErrorMessages) {                       \
 815         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 816         len = RegexLengthToShowInErrorMessages - 10;                    \
 817         ellipses = "...";                                               \
 818     }                                                                   \
 819     code;                                                               \
 820 } STMT_END
 821
 822 #define FAIL(msg) _FAIL(                            \
 823     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 824             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 825
 826 #define FAIL2(msg,arg) _FAIL(                       \
 827     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 828             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 829
 830 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 831     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 832      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 833
 834 /*
 835  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 836  */
 837 #define Simple_vFAIL(m) STMT_START {                                    \
 838     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 839             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 840 } STMT_END
 841
 842 /*
 843  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 844  */
 845 #define vFAIL(m) STMT_START {                           \
 846     PREPARE_TO_DIE;                                     \
 847     Simple_vFAIL(m);                                    \
 848 } STMT_END
 849
 850 /*
 851  * Like Simple_vFAIL(), but accepts two arguments.
 852  */
 853 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 854     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
 855                       REPORT_LOCATION_ARGS(RExC_parse));        \
 856 } STMT_END
 857
 858 /*
 859  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 860  */
 861 #define vFAIL2(m,a1) STMT_START {                       \
 862     PREPARE_TO_DIE;                                     \
 863     Simple_vFAIL2(m, a1);                               \
 864 } STMT_END
 865
 866
 867 /*
 868  * Like Simple_vFAIL(), but accepts three arguments.
 869  */
 870 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 871     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,            \
 872             REPORT_LOCATION_ARGS(RExC_parse));                  \
 873 } STMT_END
 874
 875 /*
 876  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 877  */
 878 #define vFAIL3(m,a1,a2) STMT_START {                    \
 879     PREPARE_TO_DIE;                                     \
 880     Simple_vFAIL3(m, a1, a2);                           \
 881 } STMT_END
 882
 883 /*
 884  * Like Simple_vFAIL(), but accepts four arguments.
 885  */
 886 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 887     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,        \
 888             REPORT_LOCATION_ARGS(RExC_parse));                  \
 889 } STMT_END
 890
 891 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 892     PREPARE_TO_DIE;                                     \
 893     Simple_vFAIL4(m, a1, a2, a3);                       \
 894 } STMT_END
 895
 896 /* A specialized version of vFAIL2 that works with UTF8f */
 897 #define vFAIL2utf8f(m, a1) STMT_START {             \
 898     PREPARE_TO_DIE;                                 \
 899     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
 900             REPORT_LOCATION_ARGS(RExC_parse));      \
 901 } STMT_END
 902
 903 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 904     PREPARE_TO_DIE;                                     \
 905     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
 906             REPORT_LOCATION_ARGS(RExC_parse));          \
 907 } STMT_END
 908
 909 /* Setting this to NULL is a signal to not output warnings */
 910 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 911     STMT_START {                                                            \
 912       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 913       RExC_copy_start_in_constructed = NULL;                                \
 914     } STMT_END
 915 #define RESTORE_WARNINGS                                                    \
 916     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 917
 918 /* Since a warning can be generated multiple times as the input is reparsed, we
 919  * output it the first time we come to that point in the parse, but suppress it
 920  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 921  * generate any warnings */
 922 #define TO_OUTPUT_WARNINGS(loc)                                         \
 923   (   RExC_copy_start_in_constructed                                    \
 924    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 925
 926 /* After we've emitted a warning, we save the position in the input so we don't
 927  * output it again */
 928 #define UPDATE_WARNINGS_LOC(loc)                                        \
 929     STMT_START {                                                        \
 930         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 931             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
 932                                                        - RExC_precomp;  \
 933         }                                                               \
 934     } STMT_END
 935
 936 /* 'warns' is the output of the packWARNx macro used in 'code' */
 937 #define _WARN_HELPER(loc, warns, code)                                  \
 938     STMT_START {                                                        \
 939         if (! RExC_copy_start_in_constructed) {                         \
 940             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 941                               " expected at '%s'",                      \
 942                               __FILE__, __LINE__, loc);                 \
 943         }                                                               \
 944         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 945             if (ckDEAD(warns))                                          \
 946                 PREPARE_TO_DIE;                                         \
 947             code;                                                       \
 948             UPDATE_WARNINGS_LOC(loc);                                   \
 949         }                                                               \
 950     } STMT_END
 951
 952 /* m is not necessarily a "literal string", in this macro */
 953 #define warn_non_literal_string(loc, packed_warn, m)                    \
 954     _WARN_HELPER(loc, packed_warn,                                      \
 955                       Perl_warner(aTHX_ packed_warn,                    \
 956                                        "%s" REPORT_LOCATION,            \
 957                                   m, REPORT_LOCATION_ARGS(loc)))
 958 #define reg_warn_non_literal_string(loc, m)                             \
 959                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
 960
 961 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
 962     STMT_START {                                                            \
 963                 char * format;                                              \
 964                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
 965                 Newx(format, format_size, char);                            \
 966                 my_strlcpy(format, m, format_size);                         \
 967                 my_strlcat(format, REPORT_LOCATION, format_size);           \
 968                 SAVEFREEPV(format);                                         \
 969                 _WARN_HELPER(loc, packwarn,                                 \
 970                       Perl_ck_warner(aTHX_ packwarn,                        \
 971                                         format,                             \
 972                                         a1, REPORT_LOCATION_ARGS(loc)));    \
 973     } STMT_END
 974
 975 #define ckWARNreg(loc,m)                                                \
 976     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 977                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 978                                           m REPORT_LOCATION,            \
 979                                           REPORT_LOCATION_ARGS(loc)))
 980
 981 #define vWARN(loc, m)                                                   \
 982     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 983                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 984                                        m REPORT_LOCATION,               \
 985                                        REPORT_LOCATION_ARGS(loc)))      \
 986
 987 #define vWARN_dep(loc, m)                                               \
 988     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 989                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 990                                        m REPORT_LOCATION,               \
 991                                        REPORT_LOCATION_ARGS(loc)))
 992
 993 #define ckWARNdep(loc,m)                                                \
 994     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 995                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 996                                             m REPORT_LOCATION,          \
 997                                             REPORT_LOCATION_ARGS(loc)))
 998
 999 #define ckWARNregdep(loc,m)                                                 \
1000     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
1001                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
1002                                                       WARN_REGEXP),         \
1003                                              m REPORT_LOCATION,             \
1004                                              REPORT_LOCATION_ARGS(loc)))
1005
1006 #define ckWARN2reg_d(loc,m, a1)                                             \
1007     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1008                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1009                                             m REPORT_LOCATION,              \
1010                                             a1, REPORT_LOCATION_ARGS(loc)))
1011
1012 #define ckWARN2reg(loc, m, a1)                                              \
1013     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1014                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1015                                           m REPORT_LOCATION,                \
1016                                           a1, REPORT_LOCATION_ARGS(loc)))
1017
1018 #define vWARN3(loc, m, a1, a2)                                              \
1019     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1020                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1021                                        m REPORT_LOCATION,                   \
1022                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
1023
1024 #define ckWARN3reg(loc, m, a1, a2)                                          \
1025     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1026                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1027                                           m REPORT_LOCATION,                \
1028                                           a1, a2,                           \
1029                                           REPORT_LOCATION_ARGS(loc)))
1030
1031 #define vWARN4(loc, m, a1, a2, a3)                                      \
1032     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1033                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1034                                        m REPORT_LOCATION,               \
1035                                        a1, a2, a3,                      \
1036                                        REPORT_LOCATION_ARGS(loc)))
1037
1038 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
1039     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1040                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1041                                           m REPORT_LOCATION,            \
1042                                           a1, a2, a3,                   \
1043                                           REPORT_LOCATION_ARGS(loc)))
1044
1045 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
1046     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1047                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1048                                        m REPORT_LOCATION,               \
1049                                        a1, a2, a3, a4,                  \
1050                                        REPORT_LOCATION_ARGS(loc)))
1051
1052 #define ckWARNexperimental(loc, class, m)                               \
1053     STMT_START {                                                        \
1054         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1055             RExC_warned_ ## class = 1;                                  \
1056             _WARN_HELPER(loc, packWARN(class),                          \
1057                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1058                                             m REPORT_LOCATION,          \
1059                                             REPORT_LOCATION_ARGS(loc)));\
1060         }                                                               \
1061     } STMT_END
1062
1063 /* Convert between a pointer to a node and its offset from the beginning of the
1064  * program */
1065 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1066 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1067
1068 /* Macros for recording node offsets.   20001227 mjd@plover.com
1069  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
1070  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
1071  * Element 0 holds the number n.
1072  * Position is 1 indexed.
1073  */
1074 #ifndef RE_TRACK_PATTERN_OFFSETS
1075 #define Set_Node_Offset_To_R(offset,byte)
1076 #define Set_Node_Offset(node,byte)
1077 #define Set_Cur_Node_Offset
1078 #define Set_Node_Length_To_R(node,len)
1079 #define Set_Node_Length(node,len)
1080 #define Set_Node_Cur_Length(node,start)
1081 #define Node_Offset(n)
1082 #define Node_Length(n)
1083 #define Set_Node_Offset_Length(node,offset,len)
1084 #define ProgLen(ri) ri->u.proglen
1085 #define SetProgLen(ri,x) ri->u.proglen = x
1086 #define Track_Code(code)
1087 #else
1088 #define ProgLen(ri) ri->u.offsets[0]
1089 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1090 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
1091         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
1092                     __LINE__, (int)(offset), (int)(byte)));             \
1093         if((offset) < 0) {                                              \
1094             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1095                                          (int)(offset));                \
1096         } else {                                                        \
1097             RExC_offsets[2*(offset)-1] = (byte);                        \
1098         }                                                               \
1099 } STMT_END
1100
1101 #define Set_Node_Offset(node,byte)                                      \
1102     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1103 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1104
1105 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1106         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1107                 __LINE__, (int)(node), (int)(len)));                    \
1108         if((node) < 0) {                                                \
1109             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1110                                          (int)(node));                  \
1111         } else {                                                        \
1112             RExC_offsets[2*(node)] = (len);                             \
1113         }                                                               \
1114 } STMT_END
1115
1116 #define Set_Node_Length(node,len) \
1117     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1118 #define Set_Node_Cur_Length(node, start)                \
1119     Set_Node_Length(node, RExC_parse - start)
1120
1121 /* Get offsets and lengths */
1122 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1123 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1124
1125 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1126     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1127     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1128 } STMT_END
1129
1130 #define Track_Code(code) STMT_START { code } STMT_END
1131 #endif
1132
1133 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1134 #define EXPERIMENTAL_INPLACESCAN
1135 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1136
1137 #ifdef DEBUGGING
1138 int
1139 Perl_re_printf(pTHX_ const char *fmt, ...)
1140 {
1141     va_list ap;
1142     int result;
1143     PerlIO *f= Perl_debug_log;
1144     PERL_ARGS_ASSERT_RE_PRINTF;
1145     va_start(ap, fmt);
1146     result = PerlIO_vprintf(f, fmt, ap);
1147     va_end(ap);
1148     return result;
1149 }
1150
1151 int
1152 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1153 {
1154     va_list ap;
1155     int result;
1156     PerlIO *f= Perl_debug_log;
1157     PERL_ARGS_ASSERT_RE_INDENTF;
1158     va_start(ap, depth);
1159     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1160     result = PerlIO_vprintf(f, fmt, ap);
1161     va_end(ap);
1162     return result;
1163 }
1164 #endif /* DEBUGGING */
1165
1166 #define DEBUG_RExC_seen()                                                   \
1167         DEBUG_OPTIMISE_MORE_r({                                             \
1168             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1169                                                                             \
1170             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1171                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1172                                                                             \
1173             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1174                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1175                                                                             \
1176             if (RExC_seen & REG_GPOS_SEEN)                                  \
1177                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1178                                                                             \
1179             if (RExC_seen & REG_RECURSE_SEEN)                               \
1180                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1181                                                                             \
1182             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1183                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1184                                                                             \
1185             if (RExC_seen & REG_VERBARG_SEEN)                               \
1186                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1187                                                                             \
1188             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1189                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1190                                                                             \
1191             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1192                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1193                                                                             \
1194             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1195                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1196                                                                             \
1197             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1198                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1199                                                                             \
1200             Perl_re_printf( aTHX_ "\n");                                    \
1201         });
1202
1203 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1204   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1205
1206
1207 #ifdef DEBUGGING
1208 static void
1209 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1210                                     const char *close_str)
1211 {
1212     if (!flags)
1213         return;
1214
1215     Perl_re_printf( aTHX_  "%s", open_str);
1216     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1217     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1218     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1219     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1220     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1221     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1222     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1223     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1224     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1225     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1226     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1227     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1228     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1229     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1230     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1231     Perl_re_printf( aTHX_  "%s", close_str);
1232 }
1233
1234
1235 static void
1236 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1237                     U32 depth, int is_inf)
1238 {
1239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1240
1241     DEBUG_OPTIMISE_MORE_r({
1242         if (!data)
1243             return;
1244         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1245             depth,
1246             where,
1247             (IV)data->pos_min,
1248             (IV)data->pos_delta,
1249             (UV)data->flags
1250         );
1251
1252         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1253
1254         Perl_re_printf( aTHX_
1255             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1256             (IV)data->whilem_c,
1257             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1258             is_inf ? "INF " : ""
1259         );
1260
1261         if (data->last_found) {
1262             int i;
1263             Perl_re_printf(aTHX_
1264                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1265                     SvPVX_const(data->last_found),
1266                     (IV)data->last_end,
1267                     (IV)data->last_start_min,
1268                     (IV)data->last_start_max
1269             );
1270
1271             for (i = 0; i < 2; i++) {
1272                 Perl_re_printf(aTHX_
1273                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1274                     data->cur_is_floating == i ? "*" : "",
1275                     i ? "Float" : "Fixed",
1276                     SvPVX_const(data->substrs[i].str),
1277                     (IV)data->substrs[i].min_offset,
1278                     (IV)data->substrs[i].max_offset
1279                 );
1280                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1281             }
1282         }
1283
1284         Perl_re_printf( aTHX_ "\n");
1285     });
1286 }
1287
1288
1289 static void
1290 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1291                 regnode *scan, U32 depth, U32 flags)
1292 {
1293     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1294
1295     DEBUG_OPTIMISE_r({
1296         regnode *Next;
1297
1298         if (!scan)
1299             return;
1300         Next = regnext(scan);
1301         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1302         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1303             depth,
1304             str,
1305             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1306             Next ? (REG_NODE_NUM(Next)) : 0 );
1307         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1308         Perl_re_printf( aTHX_  "\n");
1309    });
1310 }
1311
1312
1313 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1314                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1315
1316 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1317                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1318
1319 #else
1320 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1321 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1322 #endif
1323
1324
1325 /* =========================================================
1326  * BEGIN edit_distance stuff.
1327  *
1328  * This calculates how many single character changes of any type are needed to
1329  * transform a string into another one.  It is taken from version 3.1 of
1330  *
1331  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1332  */
1333
1334 /* Our unsorted dictionary linked list.   */
1335 /* Note we use UVs, not chars. */
1336
1337 struct dictionary{
1338   UV key;
1339   UV value;
1340   struct dictionary* next;
1341 };
1342 typedef struct dictionary item;
1343
1344
1345 PERL_STATIC_INLINE item*
1346 push(UV key, item* curr)
1347 {
1348     item* head;
1349     Newx(head, 1, item);
1350     head->key = key;
1351     head->value = 0;
1352     head->next = curr;
1353     return head;
1354 }
1355
1356
1357 PERL_STATIC_INLINE item*
1358 find(item* head, UV key)
1359 {
1360     item* iterator = head;
1361     while (iterator){
1362         if (iterator->key == key){
1363             return iterator;
1364         }
1365         iterator = iterator->next;
1366     }
1367
1368     return NULL;
1369 }
1370
1371 PERL_STATIC_INLINE item*
1372 uniquePush(item* head, UV key)
1373 {
1374     item* iterator = head;
1375
1376     while (iterator){
1377         if (iterator->key == key) {
1378             return head;
1379         }
1380         iterator = iterator->next;
1381     }
1382
1383     return push(key, head);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 dict_free(item* head)
1388 {
1389     item* iterator = head;
1390
1391     while (iterator) {
1392         item* temp = iterator;
1393         iterator = iterator->next;
1394         Safefree(temp);
1395     }
1396
1397     head = NULL;
1398 }
1399
1400 /* End of Dictionary Stuff */
1401
1402 /* All calculations/work are done here */
1403 STATIC int
1404 S_edit_distance(const UV* src,
1405                 const UV* tgt,
1406                 const STRLEN x,             /* length of src[] */
1407                 const STRLEN y,             /* length of tgt[] */
1408                 const SSize_t maxDistance
1409 )
1410 {
1411     item *head = NULL;
1412     UV swapCount, swapScore, targetCharCount, i, j;
1413     UV *scores;
1414     UV score_ceil = x + y;
1415
1416     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1417
1418     /* intialize matrix start values */
1419     Newx(scores, ( (x + 2) * (y + 2)), UV);
1420     scores[0] = score_ceil;
1421     scores[1 * (y + 2) + 0] = score_ceil;
1422     scores[0 * (y + 2) + 1] = score_ceil;
1423     scores[1 * (y + 2) + 1] = 0;
1424     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1425
1426     /* work loops    */
1427     /* i = src index */
1428     /* j = tgt index */
1429     for (i=1;i<=x;i++) {
1430         if (i < x)
1431             head = uniquePush(head, src[i]);
1432         scores[(i+1) * (y + 2) + 1] = i;
1433         scores[(i+1) * (y + 2) + 0] = score_ceil;
1434         swapCount = 0;
1435
1436         for (j=1;j<=y;j++) {
1437             if (i == 1) {
1438                 if(j < y)
1439                 head = uniquePush(head, tgt[j]);
1440                 scores[1 * (y + 2) + (j + 1)] = j;
1441                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1442             }
1443
1444             targetCharCount = find(head, tgt[j-1])->value;
1445             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1446
1447             if (src[i-1] != tgt[j-1]){
1448                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1449             }
1450             else {
1451                 swapCount = j;
1452                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1453             }
1454         }
1455
1456         find(head, src[i-1])->value = i;
1457     }
1458
1459     {
1460         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1461         dict_free(head);
1462         Safefree(scores);
1463         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1464     }
1465 }
1466
1467 /* END of edit_distance() stuff
1468  * ========================================================= */
1469
1470 /* Mark that we cannot extend a found fixed substring at this point.
1471    Update the longest found anchored substring or the longest found
1472    floating substrings if needed. */
1473
1474 STATIC void
1475 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1476                     SSize_t *minlenp, int is_inf)
1477 {
1478     const STRLEN l = CHR_SVLEN(data->last_found);
1479     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1480     const STRLEN old_l = CHR_SVLEN(longest_sv);
1481     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1482
1483     PERL_ARGS_ASSERT_SCAN_COMMIT;
1484
1485     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1486         const U8 i = data->cur_is_floating;
1487         SvSetMagicSV(longest_sv, data->last_found);
1488         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1489
1490         if (!i) /* fixed */
1491             data->substrs[0].max_offset = data->substrs[0].min_offset;
1492         else { /* float */
1493             data->substrs[1].max_offset =
1494                       (is_inf)
1495                        ? OPTIMIZE_INFTY
1496                        : (l
1497                           ? data->last_start_max
1498                           /* temporary underflow guard for 5.32 */
1499                           : data->pos_delta < 0 ? OPTIMIZE_INFTY
1500                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1501                                          ? OPTIMIZE_INFTY
1502                                          : data->pos_min + data->pos_delta));
1503         }
1504
1505         data->substrs[i].flags &= ~SF_BEFORE_EOL;
1506         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1507         data->substrs[i].minlenp = minlenp;
1508         data->substrs[i].lookbehind = 0;
1509     }
1510
1511     SvCUR_set(data->last_found, 0);
1512     {
1513         SV * const sv = data->last_found;
1514         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1515             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1516             if (mg)
1517                 mg->mg_len = 0;
1518         }
1519     }
1520     data->last_end = -1;
1521     data->flags &= ~SF_BEFORE_EOL;
1522     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1523 }
1524
1525 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1526  * list that describes which code points it matches */
1527
1528 STATIC void
1529 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1530 {
1531     /* Set the SSC 'ssc' to match an empty string or any code point */
1532
1533     PERL_ARGS_ASSERT_SSC_ANYTHING;
1534
1535     assert(is_ANYOF_SYNTHETIC(ssc));
1536
1537     /* mortalize so won't leak */
1538     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1539     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1540 }
1541
1542 STATIC int
1543 S_ssc_is_anything(const regnode_ssc *ssc)
1544 {
1545     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1546      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1547      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1548      * in any way, so there's no point in using it */
1549
1550     UV start, end;
1551     bool ret;
1552
1553     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1554
1555     assert(is_ANYOF_SYNTHETIC(ssc));
1556
1557     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1558         return FALSE;
1559     }
1560
1561     /* See if the list consists solely of the range 0 - Infinity */
1562     invlist_iterinit(ssc->invlist);
1563     ret = invlist_iternext(ssc->invlist, &start, &end)
1564           && start == 0
1565           && end == UV_MAX;
1566
1567     invlist_iterfinish(ssc->invlist);
1568
1569     if (ret) {
1570         return TRUE;
1571     }
1572
1573     /* If e.g., both \w and \W are set, matches everything */
1574     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1575         int i;
1576         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1577             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1578                 return TRUE;
1579             }
1580         }
1581     }
1582
1583     return FALSE;
1584 }
1585
1586 STATIC void
1587 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1588 {
1589     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1590      * string, any code point, or any posix class under locale */
1591
1592     PERL_ARGS_ASSERT_SSC_INIT;
1593
1594     Zero(ssc, 1, regnode_ssc);
1595     set_ANYOF_SYNTHETIC(ssc);
1596     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1597     ssc_anything(ssc);
1598
1599     /* If any portion of the regex is to operate under locale rules that aren't
1600      * fully known at compile time, initialization includes it.  The reason
1601      * this isn't done for all regexes is that the optimizer was written under
1602      * the assumption that locale was all-or-nothing.  Given the complexity and
1603      * lack of documentation in the optimizer, and that there are inadequate
1604      * test cases for locale, many parts of it may not work properly, it is
1605      * safest to avoid locale unless necessary. */
1606     if (RExC_contains_locale) {
1607         ANYOF_POSIXL_SETALL(ssc);
1608     }
1609     else {
1610         ANYOF_POSIXL_ZERO(ssc);
1611     }
1612 }
1613
1614 STATIC int
1615 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1616                         const regnode_ssc *ssc)
1617 {
1618     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1619      * to the list of code points matched, and locale posix classes; hence does
1620      * not check its flags) */
1621
1622     UV start, end;
1623     bool ret;
1624
1625     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1626
1627     assert(is_ANYOF_SYNTHETIC(ssc));
1628
1629     invlist_iterinit(ssc->invlist);
1630     ret = invlist_iternext(ssc->invlist, &start, &end)
1631           && start == 0
1632           && end == UV_MAX;
1633
1634     invlist_iterfinish(ssc->invlist);
1635
1636     if (! ret) {
1637         return FALSE;
1638     }
1639
1640     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1641         return FALSE;
1642     }
1643
1644     return TRUE;
1645 }
1646
1647 #define INVLIST_INDEX 0
1648 #define ONLY_LOCALE_MATCHES_INDEX 1
1649 #define DEFERRED_USER_DEFINED_INDEX 2
1650
1651 STATIC SV*
1652 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1653                                const regnode_charclass* const node)
1654 {
1655     /* Returns a mortal inversion list defining which code points are matched
1656      * by 'node', which is of type ANYOF.  Handles complementing the result if
1657      * appropriate.  If some code points aren't knowable at this time, the
1658      * returned list must, and will, contain every code point that is a
1659      * possibility. */
1660
1661     SV* invlist = NULL;
1662     SV* only_utf8_locale_invlist = NULL;
1663     unsigned int i;
1664     const U32 n = ARG(node);
1665     bool new_node_has_latin1 = FALSE;
1666     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1667                       ? 0
1668                       : ANYOF_FLAGS(node);
1669
1670     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1671
1672     /* Look at the data structure created by S_set_ANYOF_arg() */
1673     if (n != ANYOF_ONLY_HAS_BITMAP) {
1674         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1675         AV * const av = MUTABLE_AV(SvRV(rv));
1676         SV **const ary = AvARRAY(av);
1677         assert(RExC_rxi->data->what[n] == 's');
1678
1679         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1680
1681             /* Here there are things that won't be known until runtime -- we
1682              * have to assume it could be anything */
1683             invlist = sv_2mortal(_new_invlist(1));
1684             return _add_range_to_invlist(invlist, 0, UV_MAX);
1685         }
1686         else if (ary[INVLIST_INDEX]) {
1687
1688             /* Use the node's inversion list */
1689             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1690         }
1691
1692         /* Get the code points valid only under UTF-8 locales */
1693         if (   (flags & ANYOFL_FOLD)
1694             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1695         {
1696             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1697         }
1698     }
1699
1700     if (! invlist) {
1701         invlist = sv_2mortal(_new_invlist(0));
1702     }
1703
1704     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1705      * code points, and an inversion list for the others, but if there are code
1706      * points that should match only conditionally on the target string being
1707      * UTF-8, those are placed in the inversion list, and not the bitmap.
1708      * Since there are circumstances under which they could match, they are
1709      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1710      * to exclude them here, so that when we invert below, the end result
1711      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1712      * have to do this here before we add the unconditionally matched code
1713      * points */
1714     if (flags & ANYOF_INVERT) {
1715         _invlist_intersection_complement_2nd(invlist,
1716                                              PL_UpperLatin1,
1717                                              &invlist);
1718     }
1719
1720     /* Add in the points from the bit map */
1721     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1722         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1723             if (ANYOF_BITMAP_TEST(node, i)) {
1724                 unsigned int start = i++;
1725
1726                 for (;    i < NUM_ANYOF_CODE_POINTS
1727                        && ANYOF_BITMAP_TEST(node, i); ++i)
1728                 {
1729                     /* empty */
1730                 }
1731                 invlist = _add_range_to_invlist(invlist, start, i-1);
1732                 new_node_has_latin1 = TRUE;
1733             }
1734         }
1735     }
1736
1737     /* If this can match all upper Latin1 code points, have to add them
1738      * as well.  But don't add them if inverting, as when that gets done below,
1739      * it would exclude all these characters, including the ones it shouldn't
1740      * that were added just above */
1741     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1742         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1743     {
1744         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1745     }
1746
1747     /* Similarly for these */
1748     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1749         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1750     }
1751
1752     if (flags & ANYOF_INVERT) {
1753         _invlist_invert(invlist);
1754     }
1755     else if (flags & ANYOFL_FOLD) {
1756         if (new_node_has_latin1) {
1757
1758             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1759              * the locale.  We can skip this if there are no 0-255 at all. */
1760             _invlist_union(invlist, PL_Latin1, &invlist);
1761
1762             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1763             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1764         }
1765         else {
1766             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1767                 invlist = add_cp_to_invlist(invlist, 'I');
1768             }
1769             if (_invlist_contains_cp(invlist,
1770                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1771             {
1772                 invlist = add_cp_to_invlist(invlist, 'i');
1773             }
1774         }
1775     }
1776
1777     /* Similarly add the UTF-8 locale possible matches.  These have to be
1778      * deferred until after the non-UTF-8 locale ones are taken care of just
1779      * above, or it leads to wrong results under ANYOF_INVERT */
1780     if (only_utf8_locale_invlist) {
1781         _invlist_union_maybe_complement_2nd(invlist,
1782                                             only_utf8_locale_invlist,
1783                                             flags & ANYOF_INVERT,
1784                                             &invlist);
1785     }
1786
1787     return invlist;
1788 }
1789
1790 /* These two functions currently do the exact same thing */
1791 #define ssc_init_zero           ssc_init
1792
1793 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1794 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1795
1796 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1797  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1798  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1799
1800 STATIC void
1801 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1802                 const regnode_charclass *and_with)
1803 {
1804     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1805      * another SSC or a regular ANYOF class.  Can create false positives. */
1806
1807     SV* anded_cp_list;
1808     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1809                           ? 0
1810                           : ANYOF_FLAGS(and_with);
1811     U8  anded_flags;
1812
1813     PERL_ARGS_ASSERT_SSC_AND;
1814
1815     assert(is_ANYOF_SYNTHETIC(ssc));
1816
1817     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1818      * the code point inversion list and just the relevant flags */
1819     if (is_ANYOF_SYNTHETIC(and_with)) {
1820         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1821         anded_flags = and_with_flags;
1822
1823         /* XXX This is a kludge around what appears to be deficiencies in the
1824          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1825          * there are paths through the optimizer where it doesn't get weeded
1826          * out when it should.  And if we don't make some extra provision for
1827          * it like the code just below, it doesn't get added when it should.
1828          * This solution is to add it only when AND'ing, which is here, and
1829          * only when what is being AND'ed is the pristine, original node
1830          * matching anything.  Thus it is like adding it to ssc_anything() but
1831          * only when the result is to be AND'ed.  Probably the same solution
1832          * could be adopted for the same problem we have with /l matching,
1833          * which is solved differently in S_ssc_init(), and that would lead to
1834          * fewer false positives than that solution has.  But if this solution
1835          * creates bugs, the consequences are only that a warning isn't raised
1836          * that should be; while the consequences for having /l bugs is
1837          * incorrect matches */
1838         if (ssc_is_anything((regnode_ssc *)and_with)) {
1839             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1840         }
1841     }
1842     else {
1843         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1844         if (OP(and_with) == ANYOFD) {
1845             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1846         }
1847         else {
1848             anded_flags = and_with_flags
1849             &( ANYOF_COMMON_FLAGS
1850               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1851               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1852             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1853                 anded_flags &=
1854                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1855             }
1856         }
1857     }
1858
1859     ANYOF_FLAGS(ssc) &= anded_flags;
1860
1861     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1862      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1863      * 'and_with' may be inverted.  When not inverted, we have the situation of
1864      * computing:
1865      *  (C1 | P1) & (C2 | P2)
1866      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1867      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1868      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1869      *                    <=  ((C1 & C2) | P1 | P2)
1870      * Alternatively, the last few steps could be:
1871      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1872      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1873      *                    <=  (C1 | C2 | (P1 & P2))
1874      * We favor the second approach if either P1 or P2 is non-empty.  This is
1875      * because these components are a barrier to doing optimizations, as what
1876      * they match cannot be known until the moment of matching as they are
1877      * dependent on the current locale, 'AND"ing them likely will reduce or
1878      * eliminate them.
1879      * But we can do better if we know that C1,P1 are in their initial state (a
1880      * frequent occurrence), each matching everything:
1881      *  (<everything>) & (C2 | P2) =  C2 | P2
1882      * Similarly, if C2,P2 are in their initial state (again a frequent
1883      * occurrence), the result is a no-op
1884      *  (C1 | P1) & (<everything>) =  C1 | P1
1885      *
1886      * Inverted, we have
1887      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1888      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1889      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1890      * */
1891
1892     if ((and_with_flags & ANYOF_INVERT)
1893         && ! is_ANYOF_SYNTHETIC(and_with))
1894     {
1895         unsigned int i;
1896
1897         ssc_intersection(ssc,
1898                          anded_cp_list,
1899                          FALSE /* Has already been inverted */
1900                          );
1901
1902         /* If either P1 or P2 is empty, the intersection will be also; can skip
1903          * the loop */
1904         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1905             ANYOF_POSIXL_ZERO(ssc);
1906         }
1907         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1908
1909             /* Note that the Posix class component P from 'and_with' actually
1910              * looks like:
1911              *      P = Pa | Pb | ... | Pn
1912              * where each component is one posix class, such as in [\w\s].
1913              * Thus
1914              *      ~P = ~(Pa | Pb | ... | Pn)
1915              *         = ~Pa & ~Pb & ... & ~Pn
1916              *        <= ~Pa | ~Pb | ... | ~Pn
1917              * The last is something we can easily calculate, but unfortunately
1918              * is likely to have many false positives.  We could do better
1919              * in some (but certainly not all) instances if two classes in
1920              * P have known relationships.  For example
1921              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1922              * So
1923              *      :lower: & :print: = :lower:
1924              * And similarly for classes that must be disjoint.  For example,
1925              * since \s and \w can have no elements in common based on rules in
1926              * the POSIX standard,
1927              *      \w & ^\S = nothing
1928              * Unfortunately, some vendor locales do not meet the Posix
1929              * standard, in particular almost everything by Microsoft.
1930              * The loop below just changes e.g., \w into \W and vice versa */
1931
1932             regnode_charclass_posixl temp;
1933             int add = 1;    /* To calculate the index of the complement */
1934
1935             Zero(&temp, 1, regnode_charclass_posixl);
1936             ANYOF_POSIXL_ZERO(&temp);
1937             for (i = 0; i < ANYOF_MAX; i++) {
1938                 assert(i % 2 != 0
1939                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1940                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1941
1942                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1943                     ANYOF_POSIXL_SET(&temp, i + add);
1944                 }
1945                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1946             }
1947             ANYOF_POSIXL_AND(&temp, ssc);
1948
1949         } /* else ssc already has no posixes */
1950     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1951          in its initial state */
1952     else if (! is_ANYOF_SYNTHETIC(and_with)
1953              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1954     {
1955         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1956          * copy it over 'ssc' */
1957         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1958             if (is_ANYOF_SYNTHETIC(and_with)) {
1959                 StructCopy(and_with, ssc, regnode_ssc);
1960             }
1961             else {
1962                 ssc->invlist = anded_cp_list;
1963                 ANYOF_POSIXL_ZERO(ssc);
1964                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1965                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1966                 }
1967             }
1968         }
1969         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1970                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1971         {
1972             /* One or the other of P1, P2 is non-empty. */
1973             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1974                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1975             }
1976             ssc_union(ssc, anded_cp_list, FALSE);
1977         }
1978         else { /* P1 = P2 = empty */
1979             ssc_intersection(ssc, anded_cp_list, FALSE);
1980         }
1981     }
1982 }
1983
1984 STATIC void
1985 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1986                const regnode_charclass *or_with)
1987 {
1988     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1989      * another SSC or a regular ANYOF class.  Can create false positives if
1990      * 'or_with' is to be inverted. */
1991
1992     SV* ored_cp_list;
1993     U8 ored_flags;
1994     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1995                          ? 0
1996                          : ANYOF_FLAGS(or_with);
1997
1998     PERL_ARGS_ASSERT_SSC_OR;
1999
2000     assert(is_ANYOF_SYNTHETIC(ssc));
2001
2002     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2003      * the code point inversion list and just the relevant flags */
2004     if (is_ANYOF_SYNTHETIC(or_with)) {
2005         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2006         ored_flags = or_with_flags;
2007     }
2008     else {
2009         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2010         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2011         if (OP(or_with) != ANYOFD) {
2012             ored_flags
2013             |= or_with_flags
2014              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2015                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2016             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2017                 ored_flags |=
2018                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2019             }
2020         }
2021     }
2022
2023     ANYOF_FLAGS(ssc) |= ored_flags;
2024
2025     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2026      * C2 is the list of code points in 'or-with'; P2, its posix classes.
2027      * 'or_with' may be inverted.  When not inverted, we have the simple
2028      * situation of computing:
2029      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
2030      * If P1|P2 yields a situation with both a class and its complement are
2031      * set, like having both \w and \W, this matches all code points, and we
2032      * can delete these from the P component of the ssc going forward.  XXX We
2033      * might be able to delete all the P components, but I (khw) am not certain
2034      * about this, and it is better to be safe.
2035      *
2036      * Inverted, we have
2037      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
2038      *                         <=  (C1 | P1) | ~C2
2039      *                         <=  (C1 | ~C2) | P1
2040      * (which results in actually simpler code than the non-inverted case)
2041      * */
2042
2043     if ((or_with_flags & ANYOF_INVERT)
2044         && ! is_ANYOF_SYNTHETIC(or_with))
2045     {
2046         /* We ignore P2, leaving P1 going forward */
2047     }   /* else  Not inverted */
2048     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2049         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2050         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2051             unsigned int i;
2052             for (i = 0; i < ANYOF_MAX; i += 2) {
2053                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2054                 {
2055                     ssc_match_all_cp(ssc);
2056                     ANYOF_POSIXL_CLEAR(ssc, i);
2057                     ANYOF_POSIXL_CLEAR(ssc, i+1);
2058                 }
2059             }
2060         }
2061     }
2062
2063     ssc_union(ssc,
2064               ored_cp_list,
2065               FALSE /* Already has been inverted */
2066               );
2067 }
2068
2069 STATIC void
2070 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2071 {
2072     PERL_ARGS_ASSERT_SSC_UNION;
2073
2074     assert(is_ANYOF_SYNTHETIC(ssc));
2075
2076     _invlist_union_maybe_complement_2nd(ssc->invlist,
2077                                         invlist,
2078                                         invert2nd,
2079                                         &ssc->invlist);
2080 }
2081
2082 STATIC void
2083 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2084                          SV* const invlist,
2085                          const bool invert2nd)
2086 {
2087     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2088
2089     assert(is_ANYOF_SYNTHETIC(ssc));
2090
2091     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2092                                                invlist,
2093                                                invert2nd,
2094                                                &ssc->invlist);
2095 }
2096
2097 STATIC void
2098 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2099 {
2100     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2101
2102     assert(is_ANYOF_SYNTHETIC(ssc));
2103
2104     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2105 }
2106
2107 STATIC void
2108 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2109 {
2110     /* AND just the single code point 'cp' into the SSC 'ssc' */
2111
2112     SV* cp_list = _new_invlist(2);
2113
2114     PERL_ARGS_ASSERT_SSC_CP_AND;
2115
2116     assert(is_ANYOF_SYNTHETIC(ssc));
2117
2118     cp_list = add_cp_to_invlist(cp_list, cp);
2119     ssc_intersection(ssc, cp_list,
2120                      FALSE /* Not inverted */
2121                      );
2122     SvREFCNT_dec_NN(cp_list);
2123 }
2124
2125 STATIC void
2126 S_ssc_clear_locale(regnode_ssc *ssc)
2127 {
2128     /* Set the SSC 'ssc' to not match any locale things */
2129     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2130
2131     assert(is_ANYOF_SYNTHETIC(ssc));
2132
2133     ANYOF_POSIXL_ZERO(ssc);
2134     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2135 }
2136
2137 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2138
2139 STATIC bool
2140 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2141 {
2142     /* The synthetic start class is used to hopefully quickly winnow down
2143      * places where a pattern could start a match in the target string.  If it
2144      * doesn't really narrow things down that much, there isn't much point to
2145      * having the overhead of using it.  This function uses some very crude
2146      * heuristics to decide if to use the ssc or not.
2147      *
2148      * It returns TRUE if 'ssc' rules out more than half what it considers to
2149      * be the "likely" possible matches, but of course it doesn't know what the
2150      * actual things being matched are going to be; these are only guesses
2151      *
2152      * For /l matches, it assumes that the only likely matches are going to be
2153      *      in the 0-255 range, uniformly distributed, so half of that is 127
2154      * For /a and /d matches, it assumes that the likely matches will be just
2155      *      the ASCII range, so half of that is 63
2156      * For /u and there isn't anything matching above the Latin1 range, it
2157      *      assumes that that is the only range likely to be matched, and uses
2158      *      half that as the cut-off: 127.  If anything matches above Latin1,
2159      *      it assumes that all of Unicode could match (uniformly), except for
2160      *      non-Unicode code points and things in the General Category "Other"
2161      *      (unassigned, private use, surrogates, controls and formats).  This
2162      *      is a much large number. */
2163
2164     U32 count = 0;      /* Running total of number of code points matched by
2165                            'ssc' */
2166     UV start, end;      /* Start and end points of current range in inversion
2167                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2168     const U32 max_code_points = (LOC)
2169                                 ?  256
2170                                 : ((  ! UNI_SEMANTICS
2171                                     ||  invlist_highest(ssc->invlist) < 256)
2172                                   ? 128
2173                                   : NON_OTHER_COUNT);
2174     const U32 max_match = max_code_points / 2;
2175
2176     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2177
2178     invlist_iterinit(ssc->invlist);
2179     while (invlist_iternext(ssc->invlist, &start, &end)) {
2180         if (start >= max_code_points) {
2181             break;
2182         }
2183         end = MIN(end, max_code_points - 1);
2184         count += end - start + 1;
2185         if (count >= max_match) {
2186             invlist_iterfinish(ssc->invlist);
2187             return FALSE;
2188         }
2189     }
2190
2191     return TRUE;
2192 }
2193
2194
2195 STATIC void
2196 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2197 {
2198     /* The inversion list in the SSC is marked mortal; now we need a more
2199      * permanent copy, which is stored the same way that is done in a regular
2200      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2201      * map */
2202
2203     SV* invlist = invlist_clone(ssc->invlist, NULL);
2204
2205     PERL_ARGS_ASSERT_SSC_FINALIZE;
2206
2207     assert(is_ANYOF_SYNTHETIC(ssc));
2208
2209     /* The code in this file assumes that all but these flags aren't relevant
2210      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2211      * by the time we reach here */
2212     assert(! (ANYOF_FLAGS(ssc)
2213         & ~( ANYOF_COMMON_FLAGS
2214             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2215             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2216
2217     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2218
2219     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2220     SvREFCNT_dec(invlist);
2221
2222     /* Make sure is clone-safe */
2223     ssc->invlist = NULL;
2224
2225     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2226         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2227         OP(ssc) = ANYOFPOSIXL;
2228     }
2229     else if (RExC_contains_locale) {
2230         OP(ssc) = ANYOFL;
2231     }
2232
2233     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2234 }
2235
2236 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2237 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2238 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2239 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2240                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2241                                : 0 )
2242
2243
2244 #ifdef DEBUGGING
2245 /*
2246    dump_trie(trie,widecharmap,revcharmap)
2247    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2248    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2249
2250    These routines dump out a trie in a somewhat readable format.
2251    The _interim_ variants are used for debugging the interim
2252    tables that are used to generate the final compressed
2253    representation which is what dump_trie expects.
2254
2255    Part of the reason for their existence is to provide a form
2256    of documentation as to how the different representations function.
2257
2258 */
2259
2260 /*
2261   Dumps the final compressed table form of the trie to Perl_debug_log.
2262   Used for debugging make_trie().
2263 */
2264
2265 STATIC void
2266 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2267             AV *revcharmap, U32 depth)
2268 {
2269     U32 state;
2270     SV *sv=sv_newmortal();
2271     int colwidth= widecharmap ? 6 : 4;
2272     U16 word;
2273     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2274
2275     PERL_ARGS_ASSERT_DUMP_TRIE;
2276
2277     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2278         depth+1, "Match","Base","Ofs" );
2279
2280     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2281         SV ** const tmp = av_fetch( revcharmap, state, 0);
2282         if ( tmp ) {
2283             Perl_re_printf( aTHX_  "%*s",
2284                 colwidth,
2285                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2286                             PL_colors[0], PL_colors[1],
2287                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2288                             PERL_PV_ESCAPE_FIRSTCHAR
2289                 )
2290             );
2291         }
2292     }
2293     Perl_re_printf( aTHX_  "\n");
2294     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2295
2296     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2297         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2298     Perl_re_printf( aTHX_  "\n");
2299
2300     for( state = 1 ; state < trie->statecount ; state++ ) {
2301         const U32 base = trie->states[ state ].trans.base;
2302
2303         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2304
2305         if ( trie->states[ state ].wordnum ) {
2306             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2307         } else {
2308             Perl_re_printf( aTHX_  "%6s", "" );
2309         }
2310
2311         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2312
2313         if ( base ) {
2314             U32 ofs = 0;
2315
2316             while( ( base + ofs  < trie->uniquecharcount ) ||
2317                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2318                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2319                                                                     != state))
2320                     ofs++;
2321
2322             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2323
2324             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2325                 if ( ( base + ofs >= trie->uniquecharcount )
2326                         && ( base + ofs - trie->uniquecharcount
2327                                                         < trie->lasttrans )
2328                         && trie->trans[ base + ofs
2329                                     - trie->uniquecharcount ].check == state )
2330                 {
2331                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2332                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2333                    );
2334                 } else {
2335                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2336                 }
2337             }
2338
2339             Perl_re_printf( aTHX_  "]");
2340
2341         }
2342         Perl_re_printf( aTHX_  "\n" );
2343     }
2344     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2345                                 depth);
2346     for (word=1; word <= trie->wordcount; word++) {
2347         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2348             (int)word, (int)(trie->wordinfo[word].prev),
2349             (int)(trie->wordinfo[word].len));
2350     }
2351     Perl_re_printf( aTHX_  "\n" );
2352 }
2353 /*
2354   Dumps a fully constructed but uncompressed trie in list form.
2355   List tries normally only are used for construction when the number of
2356   possible chars (trie->uniquecharcount) is very high.
2357   Used for debugging make_trie().
2358 */
2359 STATIC void
2360 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2361                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2362                          U32 depth)
2363 {
2364     U32 state;
2365     SV *sv=sv_newmortal();
2366     int colwidth= widecharmap ? 6 : 4;
2367     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2368
2369     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2370
2371     /* print out the table precompression.  */
2372     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2373             depth+1 );
2374     Perl_re_indentf( aTHX_  "%s",
2375             depth+1, "------:-----+-----------------\n" );
2376
2377     for( state=1 ; state < next_alloc ; state ++ ) {
2378         U16 charid;
2379
2380         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2381             depth+1, (UV)state  );
2382         if ( ! trie->states[ state ].wordnum ) {
2383             Perl_re_printf( aTHX_  "%5s| ","");
2384         } else {
2385             Perl_re_printf( aTHX_  "W%4x| ",
2386                 trie->states[ state ].wordnum
2387             );
2388         }
2389         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2390             SV ** const tmp = av_fetch( revcharmap,
2391                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2392             if ( tmp ) {
2393                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2394                     colwidth,
2395                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2396                               colwidth,
2397                               PL_colors[0], PL_colors[1],
2398                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2399                               | PERL_PV_ESCAPE_FIRSTCHAR
2400                     ) ,
2401                     TRIE_LIST_ITEM(state, charid).forid,
2402                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2403                 );
2404                 if (!(charid % 10))
2405                     Perl_re_printf( aTHX_  "\n%*s| ",
2406                         (int)((depth * 2) + 14), "");
2407             }
2408         }
2409         Perl_re_printf( aTHX_  "\n");
2410     }
2411 }
2412
2413 /*
2414   Dumps a fully constructed but uncompressed trie in table form.
2415   This is the normal DFA style state transition table, with a few
2416   twists to facilitate compression later.
2417   Used for debugging make_trie().
2418 */
2419 STATIC void
2420 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2421                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2422                           U32 depth)
2423 {
2424     U32 state;
2425     U16 charid;
2426     SV *sv=sv_newmortal();
2427     int colwidth= widecharmap ? 6 : 4;
2428     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2429
2430     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2431
2432     /*
2433        print out the table precompression so that we can do a visual check
2434        that they are identical.
2435      */
2436
2437     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2438
2439     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2440         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2441         if ( tmp ) {
2442             Perl_re_printf( aTHX_  "%*s",
2443                 colwidth,
2444                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2445                             PL_colors[0], PL_colors[1],
2446                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2447                             PERL_PV_ESCAPE_FIRSTCHAR
2448                 )
2449             );
2450         }
2451     }
2452
2453     Perl_re_printf( aTHX_ "\n");
2454     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2455
2456     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2457         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2458     }
2459
2460     Perl_re_printf( aTHX_  "\n" );
2461
2462     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2463
2464         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2465             depth+1,
2466             (UV)TRIE_NODENUM( state ) );
2467
2468         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2469             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2470             if (v)
2471                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2472             else
2473                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2474         }
2475         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2476             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2477                                             (UV)trie->trans[ state ].check );
2478         } else {
2479             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2480                                             (UV)trie->trans[ state ].check,
2481             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2482         }
2483     }
2484 }
2485
2486 #endif
2487
2488
2489 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2490   startbranch: the first branch in the whole branch sequence
2491   first      : start branch of sequence of branch-exact nodes.
2492                May be the same as startbranch
2493   last       : Thing following the last branch.
2494                May be the same as tail.
2495   tail       : item following the branch sequence
2496   count      : words in the sequence
2497   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2498   depth      : indent depth
2499
2500 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2501
2502 A trie is an N'ary tree where the branches are determined by digital
2503 decomposition of the key. IE, at the root node you look up the 1st character and
2504 follow that branch repeat until you find the end of the branches. Nodes can be
2505 marked as "accepting" meaning they represent a complete word. Eg:
2506
2507   /he|she|his|hers/
2508
2509 would convert into the following structure. Numbers represent states, letters
2510 following numbers represent valid transitions on the letter from that state, if
2511 the number is in square brackets it represents an accepting state, otherwise it
2512 will be in parenthesis.
2513
2514       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2515       |    |
2516       |   (2)
2517       |    |
2518      (1)   +-i->(6)-+-s->[7]
2519       |
2520       +-s->(3)-+-h->(4)-+-e->[5]
2521
2522       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2523
2524 This shows that when matching against the string 'hers' we will begin at state 1
2525 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2526 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2527 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2528 single traverse. We store a mapping from accepting to state to which word was
2529 matched, and then when we have multiple possibilities we try to complete the
2530 rest of the regex in the order in which they occurred in the alternation.
2531
2532 The only prior NFA like behaviour that would be changed by the TRIE support is
2533 the silent ignoring of duplicate alternations which are of the form:
2534
2535  / (DUPE|DUPE) X? (?{ ... }) Y /x
2536
2537 Thus EVAL blocks following a trie may be called a different number of times with
2538 and without the optimisation. With the optimisations dupes will be silently
2539 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2540 the following demonstrates:
2541
2542  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2543
2544 which prints out 'word' three times, but
2545
2546  'words'=~/(word|word|word)(?{ print $1 })S/
2547
2548 which doesnt print it out at all. This is due to other optimisations kicking in.
2549
2550 Example of what happens on a structural level:
2551
2552 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2553
2554    1: CURLYM[1] {1,32767}(18)
2555    5:   BRANCH(8)
2556    6:     EXACT <ac>(16)
2557    8:   BRANCH(11)
2558    9:     EXACT <ad>(16)
2559   11:   BRANCH(14)
2560   12:     EXACT <ab>(16)
2561   16:   SUCCEED(0)
2562   17:   NOTHING(18)
2563   18: END(0)
2564
2565 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2566 and should turn into:
2567
2568    1: CURLYM[1] {1,32767}(18)
2569    5:   TRIE(16)
2570         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2571           <ac>
2572           <ad>
2573           <ab>
2574   16:   SUCCEED(0)
2575   17:   NOTHING(18)
2576   18: END(0)
2577
2578 Cases where tail != last would be like /(?foo|bar)baz/:
2579
2580    1: BRANCH(4)
2581    2:   EXACT <foo>(8)
2582    4: BRANCH(7)
2583    5:   EXACT <bar>(8)
2584    7: TAIL(8)
2585    8: EXACT <baz>(10)
2586   10: END(0)
2587
2588 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2589 and would end up looking like:
2590
2591     1: TRIE(8)
2592       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2593         <foo>
2594         <bar>
2595    7: TAIL(8)
2596    8: EXACT <baz>(10)
2597   10: END(0)
2598
2599     d = uvchr_to_utf8_flags(d, uv, 0);
2600
2601 is the recommended Unicode-aware way of saying
2602
2603     *(d++) = uv;
2604 */
2605
2606 #define TRIE_STORE_REVCHAR(val)                                            \
2607     STMT_START {                                                           \
2608         if (UTF) {                                                         \
2609             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2610             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2611             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2612             *kapow = '\0';                                                 \
2613             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2614             SvPOK_on(zlopp);                                               \
2615             SvUTF8_on(zlopp);                                              \
2616             av_push(revcharmap, zlopp);                                    \
2617         } else {                                                           \
2618             char ooooff = (char)val;                                           \
2619             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2620         }                                                                  \
2621         } STMT_END
2622
2623 /* This gets the next character from the input, folding it if not already
2624  * folded. */
2625 #define TRIE_READ_CHAR STMT_START {                                           \
2626     wordlen++;                                                                \
2627     if ( UTF ) {                                                              \
2628         /* if it is UTF then it is either already folded, or does not need    \
2629          * folding */                                                         \
2630         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2631     }                                                                         \
2632     else if (folder == PL_fold_latin1) {                                      \
2633         /* This folder implies Unicode rules, which in the range expressible  \
2634          *  by not UTF is the lower case, with the two exceptions, one of     \
2635          *  which should have been taken care of before calling this */       \
2636         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2637         uvc = toLOWER_L1(*uc);                                                \
2638         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2639         len = 1;                                                              \
2640     } else {                                                                  \
2641         /* raw data, will be folded later if needed */                        \
2642         uvc = (U32)*uc;                                                       \
2643         len = 1;                                                              \
2644     }                                                                         \
2645 } STMT_END
2646
2647
2648
2649 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2650     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2651         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2652         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2653         TRIE_LIST_LEN( state ) = ging;                          \
2654     }                                                           \
2655     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2656     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2657     TRIE_LIST_CUR( state )++;                                   \
2658 } STMT_END
2659
2660 #define TRIE_LIST_NEW(state) STMT_START {                       \
2661     Newx( trie->states[ state ].trans.list,                     \
2662         4, reg_trie_trans_le );                                 \
2663      TRIE_LIST_CUR( state ) = 1;                                \
2664      TRIE_LIST_LEN( state ) = 4;                                \
2665 } STMT_END
2666
2667 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2668     U16 dupe= trie->states[ state ].wordnum;                    \
2669     regnode * const noper_next = regnext( noper );              \
2670                                                                 \
2671     DEBUG_r({                                                   \
2672         /* store the word for dumping */                        \
2673         SV* tmp;                                                \
2674         if (OP(noper) != NOTHING)                               \
2675             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2676         else                                                    \
2677             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2678         av_push( trie_words, tmp );                             \
2679     });                                                         \
2680                                                                 \
2681     curword++;                                                  \
2682     trie->wordinfo[curword].prev   = 0;                         \
2683     trie->wordinfo[curword].len    = wordlen;                   \
2684     trie->wordinfo[curword].accept = state;                     \
2685                                                                 \
2686     if ( noper_next < tail ) {                                  \
2687         if (!trie->jump)                                        \
2688             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2689                                                  sizeof(U16) ); \
2690         trie->jump[curword] = (U16)(noper_next - convert);      \
2691         if (!jumper)                                            \
2692             jumper = noper_next;                                \
2693         if (!nextbranch)                                        \
2694             nextbranch= regnext(cur);                           \
2695     }                                                           \
2696                                                                 \
2697     if ( dupe ) {                                               \
2698         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2699         /* chain, so that when the bits of chain are later    */\
2700         /* linked together, the dups appear in the chain      */\
2701         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2702         trie->wordinfo[dupe].prev = curword;                    \
2703     } else {                                                    \
2704         /* we haven't inserted this word yet.                */ \
2705         trie->states[ state ].wordnum = curword;                \
2706     }                                                           \
2707 } STMT_END
2708
2709
2710 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2711      ( ( base + charid >=  ucharcount                                   \
2712          && base + charid < ubound                                      \
2713          && state == trie->trans[ base - ucharcount + charid ].check    \
2714          && trie->trans[ base - ucharcount + charid ].next )            \
2715            ? trie->trans[ base - ucharcount + charid ].next             \
2716            : ( state==1 ? special : 0 )                                 \
2717       )
2718
2719 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2720 STMT_START {                                                \
2721     TRIE_BITMAP_SET(trie, uvc);                             \
2722     /* store the folded codepoint */                        \
2723     if ( folder )                                           \
2724         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2725                                                             \
2726     if ( !UTF ) {                                           \
2727         /* store first byte of utf8 representation of */    \
2728         /* variant codepoints */                            \
2729         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2730             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2731         }                                                   \
2732     }                                                       \
2733 } STMT_END
2734 #define MADE_TRIE       1
2735 #define MADE_JUMP_TRIE  2
2736 #define MADE_EXACT_TRIE 4
2737
2738 STATIC I32
2739 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2740                   regnode *first, regnode *last, regnode *tail,
2741                   U32 word_count, U32 flags, U32 depth)
2742 {
2743     /* first pass, loop through and scan words */
2744     reg_trie_data *trie;
2745     HV *widecharmap = NULL;
2746     AV *revcharmap = newAV();
2747     regnode *cur;
2748     STRLEN len = 0;
2749     UV uvc = 0;
2750     U16 curword = 0;
2751     U32 next_alloc = 0;
2752     regnode *jumper = NULL;
2753     regnode *nextbranch = NULL;
2754     regnode *convert = NULL;
2755     U32 *prev_states; /* temp array mapping each state to previous one */
2756     /* we just use folder as a flag in utf8 */
2757     const U8 * folder = NULL;
2758
2759     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2760      * which stands for one trie structure, one hash, optionally followed
2761      * by two arrays */
2762 #ifdef DEBUGGING
2763     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2764     AV *trie_words = NULL;
2765     /* along with revcharmap, this only used during construction but both are
2766      * useful during debugging so we store them in the struct when debugging.
2767      */
2768 #else
2769     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2770     STRLEN trie_charcount=0;
2771 #endif
2772     SV *re_trie_maxbuff;
2773     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2774
2775     PERL_ARGS_ASSERT_MAKE_TRIE;
2776 #ifndef DEBUGGING
2777     PERL_UNUSED_ARG(depth);
2778 #endif
2779
2780     switch (flags) {
2781         case EXACT: case EXACT_REQ8: case EXACTL: break;
2782         case EXACTFAA:
2783         case EXACTFUP:
2784         case EXACTFU:
2785         case EXACTFLU8: folder = PL_fold_latin1; break;
2786         case EXACTF:  folder = PL_fold; break;
2787         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2788     }
2789
2790     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2791     trie->refcount = 1;
2792     trie->startstate = 1;
2793     trie->wordcount = word_count;
2794     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2795     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2796     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2797         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2798     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2799                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2800
2801     DEBUG_r({
2802         trie_words = newAV();
2803     });
2804
2805     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2806     assert(re_trie_maxbuff);
2807     if (!SvIOK(re_trie_maxbuff)) {
2808         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2809     }
2810     DEBUG_TRIE_COMPILE_r({
2811         Perl_re_indentf( aTHX_
2812           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2813           depth+1,
2814           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2815           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2816     });
2817
2818    /* Find the node we are going to overwrite */
2819     if ( first == startbranch && OP( last ) != BRANCH ) {
2820         /* whole branch chain */
2821         convert = first;
2822     } else {
2823         /* branch sub-chain */
2824         convert = NEXTOPER( first );
2825     }
2826
2827     /*  -- First loop and Setup --
2828
2829        We first traverse the branches and scan each word to determine if it
2830        contains widechars, and how many unique chars there are, this is
2831        important as we have to build a table with at least as many columns as we
2832        have unique chars.
2833
2834        We use an array of integers to represent the character codes 0..255
2835        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2836        the native representation of the character value as the key and IV's for
2837        the coded index.
2838
2839        *TODO* If we keep track of how many times each character is used we can
2840        remap the columns so that the table compression later on is more
2841        efficient in terms of memory by ensuring the most common value is in the
2842        middle and the least common are on the outside.  IMO this would be better
2843        than a most to least common mapping as theres a decent chance the most
2844        common letter will share a node with the least common, meaning the node
2845        will not be compressible. With a middle is most common approach the worst
2846        case is when we have the least common nodes twice.
2847
2848      */
2849
2850     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2851         regnode *noper = NEXTOPER( cur );
2852         const U8 *uc;
2853         const U8 *e;
2854         int foldlen = 0;
2855         U32 wordlen      = 0;         /* required init */
2856         STRLEN minchars = 0;
2857         STRLEN maxchars = 0;
2858         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2859                                                bitmap?*/
2860
2861         if (OP(noper) == NOTHING) {
2862             /* skip past a NOTHING at the start of an alternation
2863              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2864              *
2865              * If the next node is not something we are supposed to process
2866              * we will just ignore it due to the condition guarding the
2867              * next block.
2868              */
2869
2870             regnode *noper_next= regnext(noper);
2871             if (noper_next < tail)
2872                 noper= noper_next;
2873         }
2874
2875         if (    noper < tail
2876             && (    OP(noper) == flags
2877                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2878                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2879                                          || OP(noper) == EXACTFUP))))
2880         {
2881             uc= (U8*)STRING(noper);
2882             e= uc + STR_LEN(noper);
2883         } else {
2884             trie->minlen= 0;
2885             continue;
2886         }
2887
2888
2889         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2890             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2891                                           regardless of encoding */
2892             if (OP( noper ) == EXACTFUP) {
2893                 /* false positives are ok, so just set this */
2894                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2895             }
2896         }
2897
2898         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2899                                            branch */
2900             TRIE_CHARCOUNT(trie)++;
2901             TRIE_READ_CHAR;
2902
2903             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2904              * is in effect.  Under /i, this character can match itself, or
2905              * anything that folds to it.  If not under /i, it can match just
2906              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2907              * all fold to k, and all are single characters.   But some folds
2908              * expand to more than one character, so for example LATIN SMALL
2909              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2910              * the string beginning at 'uc' is 'ffi', it could be matched by
2911              * three characters, or just by the one ligature character. (It
2912              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2913              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2914              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2915              * match.)  The trie needs to know the minimum and maximum number
2916              * of characters that could match so that it can use size alone to
2917              * quickly reject many match attempts.  The max is simple: it is
2918              * the number of folded characters in this branch (since a fold is
2919              * never shorter than what folds to it. */
2920
2921             maxchars++;
2922
2923             /* And the min is equal to the max if not under /i (indicated by
2924              * 'folder' being NULL), or there are no multi-character folds.  If
2925              * there is a multi-character fold, the min is incremented just
2926              * once, for the character that folds to the sequence.  Each
2927              * character in the sequence needs to be added to the list below of
2928              * characters in the trie, but we count only the first towards the
2929              * min number of characters needed.  This is done through the
2930              * variable 'foldlen', which is returned by the macros that look
2931              * for these sequences as the number of bytes the sequence
2932              * occupies.  Each time through the loop, we decrement 'foldlen' by
2933              * how many bytes the current char occupies.  Only when it reaches
2934              * 0 do we increment 'minchars' or look for another multi-character
2935              * sequence. */
2936             if (folder == NULL) {
2937                 minchars++;
2938             }
2939             else if (foldlen > 0) {
2940                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2941             }
2942             else {
2943                 minchars++;
2944
2945                 /* See if *uc is the beginning of a multi-character fold.  If
2946                  * so, we decrement the length remaining to look at, to account
2947                  * for the current character this iteration.  (We can use 'uc'
2948                  * instead of the fold returned by TRIE_READ_CHAR because for
2949                  * non-UTF, the latin1_safe macro is smart enough to account
2950                  * for all the unfolded characters, and because for UTF, the
2951                  * string will already have been folded earlier in the
2952                  * compilation process */
2953                 if (UTF) {
2954                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2955                         foldlen -= UTF8SKIP(uc);
2956                     }
2957                 }
2958                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2959                     foldlen--;
2960                 }
2961             }
2962
2963             /* The current character (and any potential folds) should be added
2964              * to the possible matching characters for this position in this
2965              * branch */
2966             if ( uvc < 256 ) {
2967                 if ( folder ) {
2968                     U8 folded= folder[ (U8) uvc ];
2969                     if ( !trie->charmap[ folded ] ) {
2970                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2971                         TRIE_STORE_REVCHAR( folded );
2972                     }
2973                 }
2974                 if ( !trie->charmap[ uvc ] ) {
2975                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2976                     TRIE_STORE_REVCHAR( uvc );
2977                 }
2978                 if ( set_bit ) {
2979                     /* store the codepoint in the bitmap, and its folded
2980                      * equivalent. */
2981                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2982                     set_bit = 0; /* We've done our bit :-) */
2983                 }
2984             } else {
2985
2986                 /* XXX We could come up with the list of code points that fold
2987                  * to this using PL_utf8_foldclosures, except not for
2988                  * multi-char folds, as there may be multiple combinations
2989                  * there that could work, which needs to wait until runtime to
2990                  * resolve (The comment about LIGATURE FFI above is such an
2991                  * example */
2992
2993                 SV** svpp;
2994                 if ( !widecharmap )
2995                     widecharmap = newHV();
2996
2997                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2998
2999                 if ( !svpp )
3000                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3001
3002                 if ( !SvTRUE( *svpp ) ) {
3003                     sv_setiv( *svpp, ++trie->uniquecharcount );
3004                     TRIE_STORE_REVCHAR(uvc);
3005                 }
3006             }
3007         } /* end loop through characters in this branch of the trie */
3008
3009         /* We take the min and max for this branch and combine to find the min
3010          * and max for all branches processed so far */
3011         if( cur == first ) {
3012             trie->minlen = minchars;
3013             trie->maxlen = maxchars;
3014         } else if (minchars < trie->minlen) {
3015             trie->minlen = minchars;
3016         } else if (maxchars > trie->maxlen) {
3017             trie->maxlen = maxchars;
3018         }
3019     } /* end first pass */
3020     DEBUG_TRIE_COMPILE_r(
3021         Perl_re_indentf( aTHX_
3022                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3023                 depth+1,
3024                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3025                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3026                 (int)trie->minlen, (int)trie->maxlen )
3027     );
3028
3029     /*
3030         We now know what we are dealing with in terms of unique chars and
3031         string sizes so we can calculate how much memory a naive
3032         representation using a flat table  will take. If it's over a reasonable
3033         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3034         conservative but potentially much slower representation using an array
3035         of lists.
3036
3037         At the end we convert both representations into the same compressed
3038         form that will be used in regexec.c for matching with. The latter
3039         is a form that cannot be used to construct with but has memory
3040         properties similar to the list form and access properties similar
3041         to the table form making it both suitable for fast searches and
3042         small enough that its feasable to store for the duration of a program.
3043
3044         See the comment in the code where the compressed table is produced
3045         inplace from the flat tabe representation for an explanation of how
3046         the compression works.
3047
3048     */
3049
3050
3051     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3052     prev_states[1] = 0;
3053
3054     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3055                                                     > SvIV(re_trie_maxbuff) )
3056     {
3057         /*
3058             Second Pass -- Array Of Lists Representation
3059
3060             Each state will be represented by a list of charid:state records
3061             (reg_trie_trans_le) the first such element holds the CUR and LEN
3062             points of the allocated array. (See defines above).
3063
3064             We build the initial structure using the lists, and then convert
3065             it into the compressed table form which allows faster lookups
3066             (but cant be modified once converted).
3067         */
3068
3069         STRLEN transcount = 1;
3070
3071         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3072             depth+1));
3073
3074         trie->states = (reg_trie_state *)
3075             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3076                                   sizeof(reg_trie_state) );
3077         TRIE_LIST_NEW(1);
3078         next_alloc = 2;
3079
3080         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3081
3082             regnode *noper   = NEXTOPER( cur );
3083             U32 state        = 1;         /* required init */
3084             U16 charid       = 0;         /* sanity init */
3085             U32 wordlen      = 0;         /* required init */
3086
3087             if (OP(noper) == NOTHING) {
3088                 regnode *noper_next= regnext(noper);
3089                 if (noper_next < tail)
3090                     noper= noper_next;
3091                 /* we will undo this assignment if noper does not
3092                  * point at a trieable type in the else clause of
3093                  * the following statement. */
3094             }
3095
3096             if (    noper < tail
3097                 && (    OP(noper) == flags
3098                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3099                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3100                                              || OP(noper) == EXACTFUP))))
3101             {
3102                 const U8 *uc= (U8*)STRING(noper);
3103                 const U8 *e= uc + STR_LEN(noper);
3104
3105                 for ( ; uc < e ; uc += len ) {
3106
3107                     TRIE_READ_CHAR;
3108
3109                     if ( uvc < 256 ) {
3110                         charid = trie->charmap[ uvc ];
3111                     } else {
3112                         SV** const svpp = hv_fetch( widecharmap,
3113                                                     (char*)&uvc,
3114                                                     sizeof( UV ),
3115                                                     0);
3116                         if ( !svpp ) {
3117                             charid = 0;
3118                         } else {
3119                             charid=(U16)SvIV( *svpp );
3120                         }
3121                     }
3122                     /* charid is now 0 if we dont know the char read, or
3123                      * nonzero if we do */
3124                     if ( charid ) {
3125
3126                         U16 check;
3127                         U32 newstate = 0;
3128
3129                         charid--;
3130                         if ( !trie->states[ state ].trans.list ) {
3131                             TRIE_LIST_NEW( state );
3132                         }
3133                         for ( check = 1;
3134                               check <= TRIE_LIST_USED( state );
3135                               check++ )
3136                         {
3137                             if ( TRIE_LIST_ITEM( state, check ).forid
3138                                                                     == charid )
3139                             {
3140                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3141                                 break;
3142                             }
3143                         }
3144                         if ( ! newstate ) {
3145                             newstate = next_alloc++;
3146                             prev_states[newstate] = state;
3147                             TRIE_LIST_PUSH( state, charid, newstate );
3148                             transcount++;
3149                         }
3150                         state = newstate;
3151                     } else {
3152                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3153                     }
3154                 }
3155             } else {
3156                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3157                  * on a trieable type. So we need to reset noper back to point at the first regop
3158                  * in the branch before we call TRIE_HANDLE_WORD()
3159                 */
3160                 noper= NEXTOPER(cur);
3161             }
3162             TRIE_HANDLE_WORD(state);
3163
3164         } /* end second pass */
3165
3166         /* next alloc is the NEXT state to be allocated */
3167         trie->statecount = next_alloc;
3168         trie->states = (reg_trie_state *)
3169             PerlMemShared_realloc( trie->states,
3170                                    next_alloc
3171                                    * sizeof(reg_trie_state) );
3172
3173         /* and now dump it out before we compress it */
3174         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3175                                                          revcharmap, next_alloc,
3176                                                          depth+1)
3177         );
3178
3179         trie->trans = (reg_trie_trans *)
3180             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3181         {
3182             U32 state;
3183             U32 tp = 0;
3184             U32 zp = 0;
3185
3186
3187             for( state=1 ; state < next_alloc ; state ++ ) {
3188                 U32 base=0;
3189
3190                 /*
3191                 DEBUG_TRIE_COMPILE_MORE_r(
3192                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3193                 );
3194                 */
3195
3196                 if (trie->states[state].trans.list) {
3197                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3198                     U16 maxid=minid;
3199                     U16 idx;
3200
3201                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3202                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3203                         if ( forid < minid ) {
3204                             minid=forid;
3205                         } else if ( forid > maxid ) {
3206                             maxid=forid;
3207                         }
3208                     }
3209                     if ( transcount < tp + maxid - minid + 1) {
3210                         transcount *= 2;
3211                         trie->trans = (reg_trie_trans *)
3212                             PerlMemShared_realloc( trie->trans,
3213                                                      transcount
3214                                                      * sizeof(reg_trie_trans) );
3215                         Zero( trie->trans + (transcount / 2),
3216                               transcount / 2,
3217                               reg_trie_trans );
3218                     }
3219                     base = trie->uniquecharcount + tp - minid;
3220                     if ( maxid == minid ) {
3221                         U32 set = 0;
3222                         for ( ; zp < tp ; zp++ ) {
3223                             if ( ! trie->trans[ zp ].next ) {
3224                                 base = trie->uniquecharcount + zp - minid;
3225                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3226                                                                    1).newstate;
3227                                 trie->trans[ zp ].check = state;
3228                                 set = 1;
3229                                 break;
3230                             }
3231                         }
3232                         if ( !set ) {
3233                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3234                                                                    1).newstate;
3235                             trie->trans[ tp ].check = state;
3236                             tp++;
3237                             zp = tp;
3238                         }
3239                     } else {
3240                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3241                             const U32 tid = base
3242                                            - trie->uniquecharcount
3243                                            + TRIE_LIST_ITEM( state, idx ).forid;
3244                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3245                                                                 idx ).newstate;
3246                             trie->trans[ tid ].check = state;
3247                         }
3248                         tp += ( maxid - minid + 1 );
3249                     }
3250                     Safefree(trie->states[ state ].trans.list);
3251                 }
3252                 /*
3253                 DEBUG_TRIE_COMPILE_MORE_r(
3254                     Perl_re_printf( aTHX_  " base: %d\n",base);
3255                 );
3256                 */
3257                 trie->states[ state ].trans.base=base;
3258             }
3259             trie->lasttrans = tp + 1;
3260         }
3261     } else {
3262         /*
3263            Second Pass -- Flat Table Representation.
3264
3265            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3266            each.  We know that we will need Charcount+1 trans at most to store
3267            the data (one row per char at worst case) So we preallocate both
3268            structures assuming worst case.
3269
3270            We then construct the trie using only the .next slots of the entry
3271            structs.
3272
3273            We use the .check field of the first entry of the node temporarily
3274            to make compression both faster and easier by keeping track of how
3275            many non zero fields are in the node.
3276
3277            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3278            transition.
3279
3280            There are two terms at use here: state as a TRIE_NODEIDX() which is
3281            a number representing the first entry of the node, and state as a
3282            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3283            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3284            if there are 2 entrys per node. eg:
3285
3286              A B       A B
3287           1. 2 4    1. 3 7
3288           2. 0 3    3. 0 5
3289           3. 0 0    5. 0 0
3290           4. 0 0    7. 0 0
3291
3292            The table is internally in the right hand, idx form. However as we
3293            also have to deal with the states array which is indexed by nodenum
3294            we have to use TRIE_NODENUM() to convert.
3295
3296         */
3297         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3298             depth+1));
3299
3300         trie->trans = (reg_trie_trans *)
3301             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3302                                   * trie->uniquecharcount + 1,
3303                                   sizeof(reg_trie_trans) );
3304         trie->states = (reg_trie_state *)
3305             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3306                                   sizeof(reg_trie_state) );
3307         next_alloc = trie->uniquecharcount + 1;
3308
3309
3310         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3311
3312             regnode *noper   = NEXTOPER( cur );
3313
3314             U32 state        = 1;         /* required init */
3315
3316             U16 charid       = 0;         /* sanity init */
3317             U32 accept_state = 0;         /* sanity init */
3318
3319             U32 wordlen      = 0;         /* required init */
3320
3321             if (OP(noper) == NOTHING) {
3322                 regnode *noper_next= regnext(noper);
3323                 if (noper_next < tail)
3324                     noper= noper_next;
3325                 /* we will undo this assignment if noper does not
3326                  * point at a trieable type in the else clause of
3327                  * the following statement. */
3328             }
3329
3330             if (    noper < tail
3331                 && (    OP(noper) == flags
3332                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3333                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3334                                              || OP(noper) == EXACTFUP))))
3335             {
3336                 const U8 *uc= (U8*)STRING(noper);
3337                 const U8 *e= uc + STR_LEN(noper);
3338
3339                 for ( ; uc < e ; uc += len ) {
3340
3341                     TRIE_READ_CHAR;
3342
3343                     if ( uvc < 256 ) {
3344                         charid = trie->charmap[ uvc ];
3345                     } else {
3346                         SV* const * const svpp = hv_fetch( widecharmap,
3347                                                            (char*)&uvc,
3348                                                            sizeof( UV ),
3349                                                            0);
3350                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3351                     }
3352                     if ( charid ) {
3353                         charid--;
3354                         if ( !trie->trans[ state + charid ].next ) {
3355                             trie->trans[ state + charid ].next = next_alloc;
3356                             trie->trans[ state ].check++;
3357                             prev_states[TRIE_NODENUM(next_alloc)]
3358                                     = TRIE_NODENUM(state);
3359                             next_alloc += trie->uniquecharcount;
3360                         }
3361                         state = trie->trans[ state + charid ].next;
3362                     } else {
3363                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3364                     }
3365                     /* charid is now 0 if we dont know the char read, or
3366                      * nonzero if we do */
3367                 }
3368             } else {
3369                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3370                  * on a trieable type. So we need to reset noper back to point at the first regop
3371                  * in the branch before we call TRIE_HANDLE_WORD().
3372                 */
3373                 noper= NEXTOPER(cur);
3374             }
3375             accept_state = TRIE_NODENUM( state );
3376             TRIE_HANDLE_WORD(accept_state);
3377
3378         } /* end second pass */
3379
3380         /* and now dump it out before we compress it */
3381         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3382                                                           revcharmap,
3383                                                           next_alloc, depth+1));
3384
3385         {
3386         /*
3387            * Inplace compress the table.*
3388
3389            For sparse data sets the table constructed by the trie algorithm will
3390            be mostly 0/FAIL transitions or to put it another way mostly empty.
3391            (Note that leaf nodes will not contain any transitions.)
3392
3393            This algorithm compresses the tables by eliminating most such
3394            transitions, at the cost of a modest bit of extra work during lookup:
3395
3396            - Each states[] entry contains a .base field which indicates the
3397            index in the state[] array wheres its transition data is stored.
3398
3399            - If .base is 0 there are no valid transitions from that node.
3400
3401            - If .base is nonzero then charid is added to it to find an entry in
3402            the trans array.
3403
3404            -If trans[states[state].base+charid].check!=state then the
3405            transition is taken to be a 0/Fail transition. Thus if there are fail
3406            transitions at the front of the node then the .base offset will point
3407            somewhere inside the previous nodes data (or maybe even into a node
3408            even earlier), but the .check field determines if the transition is
3409            valid.
3410
3411            XXX - wrong maybe?
3412            The following process inplace converts the table to the compressed
3413            table: We first do not compress the root node 1,and mark all its
3414            .check pointers as 1 and set its .base pointer as 1 as well. This
3415            allows us to do a DFA construction from the compressed table later,
3416            and ensures that any .base pointers we calculate later are greater
3417            than 0.
3418
3419            - We set 'pos' to indicate the first entry of the second node.
3420
3421            - We then iterate over the columns of the node, finding the first and
3422            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3423            and set the .check pointers accordingly, and advance pos
3424            appropriately and repreat for the next node. Note that when we copy
3425            the next pointers we have to convert them from the original
3426            NODEIDX form to NODENUM form as the former is not valid post
3427            compression.
3428
3429            - If a node has no transitions used we mark its base as 0 and do not
3430            advance the pos pointer.
3431
3432            - If a node only has one transition we use a second pointer into the
3433            structure to fill in allocated fail transitions from other states.
3434            This pointer is independent of the main pointer and scans forward
3435            looking for null transitions that are allocated to a state. When it
3436            finds one it writes the single transition into the "hole".  If the
3437            pointer doesnt find one the single transition is appended as normal.
3438
3439            - Once compressed we can Renew/realloc the structures to release the
3440            excess space.
3441
3442            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3443            specifically Fig 3.47 and the associated pseudocode.
3444
3445            demq
3446         */
3447         const U32 laststate = TRIE_NODENUM( next_alloc );
3448         U32 state, charid;
3449         U32 pos = 0, zp=0;
3450         trie->statecount = laststate;
3451
3452         for ( state = 1 ; state < laststate ; state++ ) {
3453             U8 flag = 0;
3454             const U32 stateidx = TRIE_NODEIDX( state );
3455             const U32 o_used = trie->trans[ stateidx ].check;
3456             U32 used = trie->trans[ stateidx ].check;
3457             trie->trans[ stateidx ].check = 0;
3458
3459             for ( charid = 0;
3460                   used && charid < trie->uniquecharcount;
3461                   charid++ )
3462             {
3463                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3464                     if ( trie->trans[ stateidx + charid ].next ) {
3465                         if (o_used == 1) {
3466                             for ( ; zp < pos ; zp++ ) {
3467                                 if ( ! trie->trans[ zp ].next ) {
3468                                     break;
3469                                 }
3470                             }
3471                             trie->states[ state ].trans.base
3472                                                     = zp
3473                                                       + trie->uniquecharcount
3474                                                       - charid ;
3475                             trie->trans[ zp ].next
3476                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3477                                                              + charid ].next );
3478                             trie->trans[ zp ].check = state;
3479                             if ( ++zp > pos ) pos = zp;
3480                             break;
3481                         }
3482                         used--;
3483                     }
3484                     if ( !flag ) {
3485                         flag = 1;
3486                         trie->states[ state ].trans.base
3487                                        = pos + trie->uniquecharcount - charid ;
3488                     }
3489                     trie->trans[ pos ].next
3490                         = SAFE_TRIE_NODENUM(
3491                                        trie->trans[ stateidx + charid ].next );
3492                     trie->trans[ pos ].check = state;
3493                     pos++;
3494                 }
3495             }
3496         }
3497         trie->lasttrans = pos + 1;
3498         trie->states = (reg_trie_state *)
3499             PerlMemShared_realloc( trie->states, laststate
3500                                    * sizeof(reg_trie_state) );
3501         DEBUG_TRIE_COMPILE_MORE_r(
3502             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3503                 depth+1,
3504                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3505                        + 1 ),
3506                 (IV)next_alloc,
3507                 (IV)pos,
3508                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3509             );
3510
3511         } /* end table compress */
3512     }
3513     DEBUG_TRIE_COMPILE_MORE_r(
3514             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3515                 depth+1,
3516                 (UV)trie->statecount,
3517                 (UV)trie->lasttrans)
3518     );
3519     /* resize the trans array to remove unused space */
3520     trie->trans = (reg_trie_trans *)
3521         PerlMemShared_realloc( trie->trans, trie->lasttrans
3522                                * sizeof(reg_trie_trans) );
3523
3524     {   /* Modify the program and insert the new TRIE node */
3525         U8 nodetype =(U8)(flags & 0xFF);
3526         char *str=NULL;
3527
3528 #ifdef DEBUGGING
3529         regnode *optimize = NULL;
3530 #ifdef RE_TRACK_PATTERN_OFFSETS
3531
3532         U32 mjd_offset = 0;
3533         U32 mjd_nodelen = 0;
3534 #endif /* RE_TRACK_PATTERN_OFFSETS */
3535 #endif /* DEBUGGING */
3536         /*
3537            This means we convert either the first branch or the first Exact,
3538            depending on whether the thing following (in 'last') is a branch
3539            or not and whther first is the startbranch (ie is it a sub part of
3540            the alternation or is it the whole thing.)
3541            Assuming its a sub part we convert the EXACT otherwise we convert
3542            the whole branch sequence, including the first.
3543          */
3544         /* Find the node we are going to overwrite */
3545         if ( first != startbranch || OP( last ) == BRANCH ) {
3546             /* branch sub-chain */
3547             NEXT_OFF( first ) = (U16)(last - first);
3548 #ifdef RE_TRACK_PATTERN_OFFSETS
3549             DEBUG_r({
3550                 mjd_offset= Node_Offset((convert));
3551                 mjd_nodelen= Node_Length((convert));
3552             });
3553 #endif
3554             /* whole branch chain */
3555         }
3556 #ifdef RE_TRACK_PATTERN_OFFSETS
3557         else {
3558             DEBUG_r({
3559                 const  regnode *nop = NEXTOPER( convert );
3560                 mjd_offset= Node_Offset((nop));
3561                 mjd_nodelen= Node_Length((nop));
3562             });
3563         }
3564         DEBUG_OPTIMISE_r(
3565             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3566                 depth+1,
3567                 (UV)mjd_offset, (UV)mjd_nodelen)
3568         );
3569 #endif
3570         /* But first we check to see if there is a common prefix we can
3571            split out as an EXACT and put in front of the TRIE node.  */
3572         trie->startstate= 1;
3573         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3574             /* we want to find the first state that has more than
3575              * one transition, if that state is not the first state
3576              * then we have a common prefix which we can remove.
3577              */
3578             U32 state;
3579             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3580                 U32 ofs = 0;
3581                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3582                                        transition, -1 means none */
3583                 U32 count = 0;
3584                 const U32 base = trie->states[ state ].trans.base;
3585
3586                 /* does this state terminate an alternation? */
3587                 if ( trie->states[state].wordnum )
3588                         count = 1;
3589
3590                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3591                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3592                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3593                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3594                     {
3595                         if ( ++count > 1 ) {
3596                             /* we have more than one transition */
3597                             SV **tmp;
3598                             U8 *ch;
3599                             /* if this is the first state there is no common prefix
3600                              * to extract, so we can exit */
3601                             if ( state == 1 ) break;
3602                             tmp = av_fetch( revcharmap, ofs, 0);
3603                             ch = (U8*)SvPV_nolen_const( *tmp );
3604
3605                             /* if we are on count 2 then we need to initialize the
3606                              * bitmap, and store the previous char if there was one
3607                              * in it*/
3608                             if ( count == 2 ) {
3609                                 /* clear the bitmap */
3610                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3611                                 DEBUG_OPTIMISE_r(
3612                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3613                                         depth+1,
3614                                         (UV)state));
3615                                 if (first_ofs >= 0) {
3616                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3617                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3618
3619                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3620                                     DEBUG_OPTIMISE_r(
3621                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3622                                     );
3623                                 }
3624                             }
3625                             /* store the current firstchar in the bitmap */
3626                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3627                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3628                         }
3629                         first_ofs = ofs;
3630                     }
3631                 }
3632                 if ( count == 1 ) {
3633                     /* This state has only one transition, its transition is part
3634                      * of a common prefix - we need to concatenate the char it
3635                      * represents to what we have so far. */
3636                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3637                     STRLEN len;
3638                     char *ch = SvPV( *tmp, len );
3639                     DEBUG_OPTIMISE_r({
3640                         SV *sv=sv_newmortal();
3641                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3642                             depth+1,
3643                             (UV)state, (UV)first_ofs,
3644                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3645                                 PL_colors[0], PL_colors[1],
3646                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3647                                 PERL_PV_ESCAPE_FIRSTCHAR
3648                             )
3649                         );
3650                     });
3651                     if ( state==1 ) {
3652                         OP( convert ) = nodetype;
3653                         str=STRING(convert);
3654                         setSTR_LEN(convert, 0);
3655                     }
3656                     assert( ( STR_LEN(convert) + len ) < 256 );
3657                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3658                     while (len--)
3659                         *str++ = *ch++;
3660                 } else {
3661 #ifdef DEBUGGING
3662                     if (state>1)
3663                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3664 #endif
3665                     break;
3666                 }
3667             }
3668             trie->prefixlen = (state-1);
3669             if (str) {
3670                 regnode *n = convert+NODE_SZ_STR(convert);
3671                 assert( NODE_SZ_STR(convert) <= U16_MAX );
3672                 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3673                 trie->startstate = state;
3674                 trie->minlen -= (state - 1);
3675                 trie->maxlen -= (state - 1);
3676 #ifdef DEBUGGING
3677                /* At least the UNICOS C compiler choked on this
3678                 * being argument to DEBUG_r(), so let's just have
3679                 * it right here. */
3680                if (
3681 #ifdef PERL_EXT_RE_BUILD
3682                    1
3683 #else
3684                    DEBUG_r_TEST
3685 #endif
3686                    ) {
3687                    regnode *fix = convert;
3688                    U32 word = trie->wordcount;
3689 #ifdef RE_TRACK_PATTERN_OFFSETS
3690                    mjd_nodelen++;
3691 #endif
3692                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3693                    while( ++fix < n ) {
3694                        Set_Node_Offset_Length(fix, 0, 0);
3695                    }
3696                    while (word--) {
3697                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3698                        if (tmp) {
3699                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3700                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3701                            else
3702                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3703                        }
3704                    }
3705                }
3706 #endif
3707                 if (trie->maxlen) {
3708                     convert = n;
3709                 } else {
3710                     NEXT_OFF(convert) = (U16)(tail - convert);
3711                     DEBUG_r(optimize= n);
3712                 }
3713             }
3714         }
3715         if (!jumper)
3716             jumper = last;
3717         if ( trie->maxlen ) {
3718             NEXT_OFF( convert ) = (U16)(tail - convert);
3719             ARG_SET( convert, data_slot );
3720             /* Store the offset to the first unabsorbed branch in
3721                jump[0], which is otherwise unused by the jump logic.
3722                We use this when dumping a trie and during optimisation. */
3723             if (trie->jump)
3724                 trie->jump[0] = (U16)(nextbranch - convert);
3725
3726             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3727              *   and there is a bitmap
3728              *   and the first "jump target" node we found leaves enough room
3729              * then convert the TRIE node into a TRIEC node, with the bitmap
3730              * embedded inline in the opcode - this is hypothetically faster.
3731              */
3732             if ( !trie->states[trie->startstate].wordnum
3733                  && trie->bitmap
3734                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3735             {
3736                 OP( convert ) = TRIEC;
3737                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3738                 PerlMemShared_free(trie->bitmap);
3739                 trie->bitmap= NULL;
3740             } else
3741                 OP( convert ) = TRIE;
3742
3743             /* store the type in the flags */
3744             convert->flags = nodetype;
3745             DEBUG_r({
3746             optimize = convert
3747                       + NODE_STEP_REGNODE
3748                       + regarglen[ OP( convert ) ];
3749             });
3750             /* XXX We really should free up the resource in trie now,
3751                    as we won't use them - (which resources?) dmq */
3752         }
3753         /* needed for dumping*/
3754         DEBUG_r(if (optimize) {
3755             regnode *opt = convert;
3756
3757             while ( ++opt < optimize) {
3758                 Set_Node_Offset_Length(opt, 0, 0);
3759             }
3760             /*
3761                 Try to clean up some of the debris left after the
3762                 optimisation.
3763              */
3764             while( optimize < jumper ) {
3765                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3766                 OP( optimize ) = OPTIMIZED;
3767                 Set_Node_Offset_Length(optimize, 0, 0);
3768                 optimize++;
3769             }
3770             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3771         });
3772     } /* end node insert */
3773
3774     /*  Finish populating the prev field of the wordinfo array.  Walk back
3775      *  from each accept state until we find another accept state, and if
3776      *  so, point the first word's .prev field at the second word. If the
3777      *  second already has a .prev field set, stop now. This will be the
3778      *  case either if we've already processed that word's accept state,
3779      *  or that state had multiple words, and the overspill words were
3780      *  already linked up earlier.
3781      */
3782     {
3783         U16 word;
3784         U32 state;
3785         U16 prev;
3786
3787         for (word=1; word <= trie->wordcount; word++) {
3788             prev = 0;
3789             if (trie->wordinfo[word].prev)
3790                 continue;
3791             state = trie->wordinfo[word].accept;
3792             while (state) {
3793                 state = prev_states[state];
3794                 if (!state)
3795                     break;
3796                 prev = trie->states[state].wordnum;
3797                 if (prev)
3798                     break;
3799             }
3800             trie->wordinfo[word].prev = prev;
3801         }
3802         Safefree(prev_states);
3803     }
3804
3805
3806     /* and now dump out the compressed format */
3807     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3808
3809     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3810 #ifdef DEBUGGING
3811     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3812     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3813 #else
3814     SvREFCNT_dec_NN(revcharmap);
3815 #endif
3816     return trie->jump
3817            ? MADE_JUMP_TRIE
3818            : trie->startstate>1
3819              ? MADE_EXACT_TRIE
3820              : MADE_TRIE;
3821 }
3822
3823 STATIC regnode *
3824 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3825 {
3826 /* The Trie is constructed and compressed now so we can build a fail array if
3827  * it's needed
3828
3829    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3830    3.32 in the
3831    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3832    Ullman 1985/88
3833    ISBN 0-201-10088-6
3834
3835    We find the fail state for each state in the trie, this state is the longest
3836    proper suffix of the current state's 'word' that is also a proper prefix of
3837    another word in our trie. State 1 represents the word '' and is thus the
3838    default fail state. This allows the DFA not to have to restart after its
3839    tried and failed a word at a given point, it simply continues as though it
3840    had been matching the other word in the first place.
3841    Consider
3842       'abcdgu'=~/abcdefg|cdgu/
3843    When we get to 'd' we are still matching the first word, we would encounter
3844    'g' which would fail, which would bring us to the state representing 'd' in
3845    the second word where we would try 'g' and succeed, proceeding to match
3846    'cdgu'.
3847  */
3848  /* add a fail transition */
3849     const U32 trie_offset = ARG(source);
3850     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3851     U32 *q;
3852     const U32 ucharcount = trie->uniquecharcount;
3853     const U32 numstates = trie->statecount;
3854     const U32 ubound = trie->lasttrans + ucharcount;
3855     U32 q_read = 0;
3856     U32 q_write = 0;
3857     U32 charid;
3858     U32 base = trie->states[ 1 ].trans.base;
3859     U32 *fail;
3860     reg_ac_data *aho;
3861     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3862     regnode *stclass;
3863     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3864
3865     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3866     PERL_UNUSED_CONTEXT;
3867 #ifndef DEBUGGING
3868     PERL_UNUSED_ARG(depth);
3869 #endif
3870
3871     if ( OP(source) == TRIE ) {
3872         struct regnode_1 *op = (struct regnode_1 *)
3873             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3874         StructCopy(source, op, struct regnode_1);
3875         stclass = (regnode *)op;
3876     } else {
3877         struct regnode_charclass *op = (struct regnode_charclass *)
3878             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3879         StructCopy(source, op, struct regnode_charclass);
3880         stclass = (regnode *)op;
3881     }
3882     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3883
3884     ARG_SET( stclass, data_slot );
3885     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3886     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3887     aho->trie=trie_offset;
3888     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3889     Copy( trie->states, aho->states, numstates, reg_trie_state );
3890     Newx( q, numstates, U32);
3891     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3892     aho->refcount = 1;
3893     fail = aho->fail;
3894     /* initialize fail[0..1] to be 1 so that we always have
3895        a valid final fail state */
3896     fail[ 0 ] = fail[ 1 ] = 1;
3897
3898     for ( charid = 0; charid < ucharcount ; charid++ ) {
3899         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3900         if ( newstate ) {
3901             q[ q_write ] = newstate;
3902             /* set to point at the root */
3903             fail[ q[ q_write++ ] ]=1;
3904         }
3905     }
3906     while ( q_read < q_write) {
3907         const U32 cur = q[ q_read++ % numstates ];
3908         base = trie->states[ cur ].trans.base;
3909
3910         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3911             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3912             if (ch_state) {
3913                 U32 fail_state = cur;
3914                 U32 fail_base;
3915                 do {
3916                     fail_state = fail[ fail_state ];
3917                     fail_base = aho->states[ fail_state ].trans.base;
3918                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3919
3920                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3921                 fail[ ch_state ] = fail_state;
3922                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3923                 {
3924                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3925                 }
3926                 q[ q_write++ % numstates] = ch_state;
3927             }
3928         }
3929     }
3930     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3931        when we fail in state 1, this allows us to use the
3932        charclass scan to find a valid start char. This is based on the principle
3933        that theres a good chance the string being searched contains lots of stuff
3934        that cant be a start char.
3935      */
3936     fail[ 0 ] = fail[ 1 ] = 0;
3937     DEBUG_TRIE_COMPILE_r({
3938         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3939                       depth, (UV)numstates
3940         );
3941         for( q_read=1; q_read<numstates; q_read++ ) {
3942             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3943         }
3944         Perl_re_printf( aTHX_  "\n");
3945     });
3946     Safefree(q);
3947     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3948     return stclass;
3949 }
3950
3951
3952 /* The below joins as many adjacent EXACTish nodes as possible into a single
3953  * one.  The regop may be changed if the node(s) contain certain sequences that
3954  * require special handling.  The joining is only done if:
3955  * 1) there is room in the current conglomerated node to entirely contain the
3956  *    next one.
3957  * 2) they are compatible node types
3958  *
3959  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3960  * these get optimized out
3961  *
3962  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3963  * as possible, even if that means splitting an existing node so that its first
3964  * part is moved to the preceeding node.  This would maximise the efficiency of
3965  * memEQ during matching.
3966  *
3967  * If a node is to match under /i (folded), the number of characters it matches
3968  * can be different than its character length if it contains a multi-character
3969  * fold.  *min_subtract is set to the total delta number of characters of the
3970  * input nodes.
3971  *
3972  * And *unfolded_multi_char is set to indicate whether or not the node contains
3973  * an unfolded multi-char fold.  This happens when it won't be known until
3974  * runtime whether the fold is valid or not; namely
3975  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3976  *      target string being matched against turns out to be UTF-8 is that fold
3977  *      valid; or
3978  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3979  *      runtime.
3980  * (Multi-char folds whose components are all above the Latin1 range are not
3981  * run-time locale dependent, and have already been folded by the time this
3982  * function is called.)
3983  *
3984  * This is as good a place as any to discuss the design of handling these
3985  * multi-character fold sequences.  It's been wrong in Perl for a very long
3986  * time.  There are three code points in Unicode whose multi-character folds
3987  * were long ago discovered to mess things up.  The previous designs for
3988  * dealing with these involved assigning a special node for them.  This
3989  * approach doesn't always work, as evidenced by this example:
3990  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3991  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3992  * would match just the \xDF, it won't be able to handle the case where a
3993  * successful match would have to cross the node's boundary.  The new approach
3994  * that hopefully generally solves the problem generates an EXACTFUP node
3995  * that is "sss" in this case.
3996  *
3997  * It turns out that there are problems with all multi-character folds, and not
3998  * just these three.  Now the code is general, for all such cases.  The
3999  * approach taken is:
4000  * 1)   This routine examines each EXACTFish node that could contain multi-
4001  *      character folded sequences.  Since a single character can fold into
4002  *      such a sequence, the minimum match length for this node is less than
4003  *      the number of characters in the node.  This routine returns in
4004  *      *min_subtract how many characters to subtract from the actual
4005  *      length of the string to get a real minimum match length; it is 0 if
4006  *      there are no multi-char foldeds.  This delta is used by the caller to
4007  *      adjust the min length of the match, and the delta between min and max,
4008  *      so that the optimizer doesn't reject these possibilities based on size
4009  *      constraints.
4010  *
4011  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4012  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
4013  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4014  *      EXACTFU nodes.  The node type of such nodes is then changed to
4015  *      EXACTFUP, indicating it is problematic, and needs careful handling.
4016  *      (The procedures in step 1) above are sufficient to handle this case in
4017  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
4018  *      the only case where there is a possible fold length change in non-UTF-8
4019  *      patterns.  By reserving a special node type for problematic cases, the
4020  *      far more common regular EXACTFU nodes can be processed faster.
4021  *      regexec.c takes advantage of this.
4022  *
4023  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4024  *      problematic cases.   These all only occur when the pattern is not
4025  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
4026  *      length change, it handles the situation where the string cannot be
4027  *      entirely folded.  The strings in an EXACTFish node are folded as much
4028  *      as possible during compilation in regcomp.c.  This saves effort in
4029  *      regex matching.  By using an EXACTFUP node when it is not possible to
4030  *      fully fold at compile time, regexec.c can know that everything in an
4031  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
4032  *      case where folding in EXACTFU nodes can't be done at compile time is
4033  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
4034  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
4035  *      handle two very different cases.  Alternatively, there could have been
4036  *      a node type where there are length changes, one for unfolded, and one
4037  *      for both.  If yet another special case needed to be created, the number
4038  *      of required node types would have to go to 7.  khw figures that even
4039  *      though there are plenty of node types to spare, that the maintenance
4040  *      cost wasn't worth the small speedup of doing it that way, especially
4041  *      since he thinks the MICRO SIGN is rarely encountered in practice.
4042  *
4043  *      There are other cases where folding isn't done at compile time, but
4044  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
4045  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
4046  *      changes.  Some folds in EXACTF depend on if the runtime target string
4047  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
4048  *      when no fold in it depends on the UTF-8ness of the target string.)
4049  *
4050  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
4051  *      validity of the fold won't be known until runtime, and so must remain
4052  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
4053  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
4054  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
4055  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4056  *      The reason this is a problem is that the optimizer part of regexec.c
4057  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4058  *      that a character in the pattern corresponds to at most a single
4059  *      character in the target string.  (And I do mean character, and not byte
4060  *      here, unlike other parts of the documentation that have never been
4061  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
4062  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4063  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
4064  *      EXACTFL nodes, violate the assumption, and they are the only instances
4065  *      where it is violated.  I'm reluctant to try to change the assumption,
4066  *      as the code involved is impenetrable to me (khw), so instead the code
4067  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
4068  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4069  *      boolean indicating whether or not the node contains such a fold.  When
4070  *      it is true, the caller sets a flag that later causes the optimizer in
4071  *      this file to not set values for the floating and fixed string lengths,
4072  *      and thus avoids the optimizer code in regexec.c that makes the invalid
4073  *      assumption.  Thus, there is no optimization based on string lengths for
4074  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4075  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
4076  *      assumption is wrong only in these cases is that all other non-UTF-8
4077  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4078  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
4079  *      EXACTF nodes because we don't know at compile time if it actually
4080  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
4081  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
4082  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
4083  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4084  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4085  *      string would require the pattern to be forced into UTF-8, the overhead
4086  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
4087  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4088  *      locale.)
4089  *
4090  *      Similarly, the code that generates tries doesn't currently handle
4091  *      not-already-folded multi-char folds, and it looks like a pain to change
4092  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4093  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4094  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4095  *      using /iaa matching will be doing so almost entirely with ASCII
4096  *      strings, so this should rarely be encountered in practice */
4097
4098 STATIC U32
4099 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4100                    UV *min_subtract, bool *unfolded_multi_char,
4101                    U32 flags, regnode *val, U32 depth)
4102 {
4103     /* Merge several consecutive EXACTish nodes into one. */
4104
4105     regnode *n = regnext(scan);
4106     U32 stringok = 1;
4107     regnode *next = scan + NODE_SZ_STR(scan);
4108     U32 merged = 0;
4109     U32 stopnow = 0;
4110 #ifdef DEBUGGING
4111     regnode *stop = scan;
4112     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4113 #else
4114     PERL_UNUSED_ARG(depth);
4115 #endif
4116
4117     PERL_ARGS_ASSERT_JOIN_EXACT;
4118 #ifndef EXPERIMENTAL_INPLACESCAN
4119     PERL_UNUSED_ARG(flags);
4120     PERL_UNUSED_ARG(val);
4121 #endif
4122     DEBUG_PEEP("join", scan, depth, 0);
4123
4124     assert(PL_regkind[OP(scan)] == EXACT);
4125
4126     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4127      * EXACT ones that are mergeable to the current one. */
4128     while (    n
4129            && (    PL_regkind[OP(n)] == NOTHING
4130                || (stringok && PL_regkind[OP(n)] == EXACT))
4131            && NEXT_OFF(n)
4132            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4133     {
4134
4135         if (OP(n) == TAIL || n > next)
4136             stringok = 0;
4137         if (PL_regkind[OP(n)] == NOTHING) {
4138             DEBUG_PEEP("skip:", n, depth, 0);
4139             NEXT_OFF(scan) += NEXT_OFF(n);
4140             next = n + NODE_STEP_REGNODE;
4141 #ifdef DEBUGGING
4142             if (stringok)
4143                 stop = n;
4144 #endif
4145             n = regnext(n);
4146         }
4147         else if (stringok) {
4148             const unsigned int oldl = STR_LEN(scan);
4149             regnode * const nnext = regnext(n);
4150
4151             /* XXX I (khw) kind of doubt that this works on platforms (should
4152              * Perl ever run on one) where U8_MAX is above 255 because of lots
4153              * of other assumptions */
4154             /* Don't join if the sum can't fit into a single node */
4155             if (oldl + STR_LEN(n) > U8_MAX)
4156                 break;
4157
4158             /* Joining something that requires UTF-8 with something that
4159              * doesn't, means the result requires UTF-8. */
4160             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4161                 OP(scan) = EXACT_REQ8;
4162             }
4163             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4164                 ;   /* join is compatible, no need to change OP */
4165             }
4166             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4167                 OP(scan) = EXACTFU_REQ8;
4168             }
4169             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4170                 ;   /* join is compatible, no need to change OP */
4171             }
4172             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4173                 ;   /* join is compatible, no need to change OP */
4174             }
4175             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4176
4177                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4178                   * which can join with EXACTFU ones.  We check for this case
4179                   * here.  These need to be resolved to either EXACTFU or
4180                   * EXACTF at joining time.  They have nothing in them that
4181                   * would forbid them from being the more desirable EXACTFU
4182                   * nodes except that they begin and/or end with a single [Ss].
4183                   * The reason this is problematic is because they could be
4184                   * joined in this loop with an adjacent node that ends and/or
4185                   * begins with [Ss] which would then form the sequence 'ss',
4186                   * which matches differently under /di than /ui, in which case
4187                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4188                   * formed, the nodes get absorbed into any adjacent EXACTFU
4189                   * node.  And if the only adjacent node is EXACTF, they get
4190                   * absorbed into that, under the theory that a longer node is
4191                   * better than two shorter ones, even if one is EXACTFU.  Note
4192                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4193                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4194
4195                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4196
4197                     /* Here the joined node would end with 's'.  If the node
4198                      * following the combination is an EXACTF one, it's better to
4199                      * join this trailing edge 's' node with that one, leaving the
4200                      * current one in 'scan' be the more desirable EXACTFU */
4201                     if (OP(nnext) == EXACTF) {
4202                         break;
4203                     }
4204
4205                     OP(scan) = EXACTFU_S_EDGE;
4206
4207                 }   /* Otherwise, the beginning 's' of the 2nd node just
4208                        becomes an interior 's' in 'scan' */
4209             }
4210             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4211                 ;   /* join is compatible, no need to change OP */
4212             }
4213             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4214
4215                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4216                  * nodes.  But the latter nodes can be also joined with EXACTFU
4217                  * ones, and that is a better outcome, so if the node following
4218                  * 'n' is EXACTFU, quit now so that those two can be joined
4219                  * later */
4220                 if (OP(nnext) == EXACTFU) {
4221                     break;
4222                 }
4223
4224                 /* The join is compatible, and the combined node will be
4225                  * EXACTF.  (These don't care if they begin or end with 's' */
4226             }
4227             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4228                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4229                     && STRING(n)[0] == 's')
4230                 {
4231                     /* When combined, we have the sequence 'ss', which means we
4232                      * have to remain /di */
4233                     OP(scan) = EXACTF;
4234                 }
4235             }
4236             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4237                 if (STRING(n)[0] == 's') {
4238                     ;   /* Here the join is compatible and the combined node
4239                            starts with 's', no need to change OP */
4240                 }
4241                 else {  /* Now the trailing 's' is in the interior */
4242                     OP(scan) = EXACTFU;
4243                 }
4244             }
4245             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4246
4247                 /* The join is compatible, and the combined node will be
4248                  * EXACTF.  (These don't care if they begin or end with 's' */
4249                 OP(scan) = EXACTF;
4250             }
4251             else if (OP(scan) != OP(n)) {
4252
4253                 /* The only other compatible joinings are the same node type */
4254                 break;
4255             }
4256
4257             DEBUG_PEEP("merg", n, depth, 0);
4258             merged++;
4259
4260             NEXT_OFF(scan) += NEXT_OFF(n);
4261             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4262             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4263             next = n + NODE_SZ_STR(n);
4264             /* Now we can overwrite *n : */
4265             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4266 #ifdef DEBUGGING
4267             stop = next - 1;
4268 #endif
4269             n = nnext;
4270             if (stopnow) break;
4271         }
4272
4273 #ifdef EXPERIMENTAL_INPLACESCAN
4274         if (flags && !NEXT_OFF(n)) {
4275             DEBUG_PEEP("atch", val, depth, 0);
4276             if (reg_off_by_arg[OP(n)]) {
4277                 ARG_SET(n, val - n);
4278             }
4279             else {
4280                 NEXT_OFF(n) = val - n;
4281             }
4282             stopnow = 1;
4283         }
4284 #endif
4285     }
4286
4287     /* This temporary node can now be turned into EXACTFU, and must, as
4288      * regexec.c doesn't handle it */
4289     if (OP(scan) == EXACTFU_S_EDGE) {
4290         OP(scan) = EXACTFU;
4291     }
4292
4293     *min_subtract = 0;
4294     *unfolded_multi_char = FALSE;
4295
4296     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4297      * can now analyze for sequences of problematic code points.  (Prior to
4298      * this final joining, sequences could have been split over boundaries, and
4299      * hence missed).  The sequences only happen in folding, hence for any
4300      * non-EXACT EXACTish node */
4301     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4302         U8* s0 = (U8*) STRING(scan);
4303         U8* s = s0;
4304         U8* s_end = s0 + STR_LEN(scan);
4305
4306         int total_count_delta = 0;  /* Total delta number of characters that
4307                                        multi-char folds expand to */
4308
4309         /* One pass is made over the node's string looking for all the
4310          * possibilities.  To avoid some tests in the loop, there are two main
4311          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4312          * non-UTF-8 */
4313         if (UTF) {
4314             U8* folded = NULL;
4315
4316             if (OP(scan) == EXACTFL) {
4317                 U8 *d;
4318
4319                 /* An EXACTFL node would already have been changed to another
4320                  * node type unless there is at least one character in it that
4321                  * is problematic; likely a character whose fold definition
4322                  * won't be known until runtime, and so has yet to be folded.
4323                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4324                  * to handle the UTF-8 case, we need to create a temporary
4325                  * folded copy using UTF-8 locale rules in order to analyze it.
4326                  * This is because our macros that look to see if a sequence is
4327                  * a multi-char fold assume everything is folded (otherwise the
4328                  * tests in those macros would be too complicated and slow).
4329                  * Note that here, the non-problematic folds will have already
4330                  * been done, so we can just copy such characters.  We actually
4331                  * don't completely fold the EXACTFL string.  We skip the
4332                  * unfolded multi-char folds, as that would just create work
4333                  * below to figure out the size they already are */
4334
4335                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4336                 d = folded;
4337                 while (s < s_end) {
4338                     STRLEN s_len = UTF8SKIP(s);
4339                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4340                         Copy(s, d, s_len, U8);
4341                         d += s_len;
4342                     }
4343                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4344                         *unfolded_multi_char = TRUE;
4345                         Copy(s, d, s_len, U8);
4346                         d += s_len;
4347                     }
4348                     else if (isASCII(*s)) {
4349                         *(d++) = toFOLD(*s);
4350                     }
4351                     else {
4352                         STRLEN len;
4353                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4354                         d += len;
4355                     }
4356                     s += s_len;
4357                 }
4358
4359                 /* Point the remainder of the routine to look at our temporary
4360                  * folded copy */
4361                 s = folded;
4362                 s_end = d;
4363             } /* End of creating folded copy of EXACTFL string */
4364
4365             /* Examine the string for a multi-character fold sequence.  UTF-8
4366              * patterns have all characters pre-folded by the time this code is
4367              * executed */
4368             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4369                                      length sequence we are looking for is 2 */
4370             {
4371                 int count = 0;  /* How many characters in a multi-char fold */
4372                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4373                 if (! len) {    /* Not a multi-char fold: get next char */
4374                     s += UTF8SKIP(s);
4375                     continue;
4376                 }
4377
4378                 { /* Here is a generic multi-char fold. */
4379                     U8* multi_end  = s + len;
4380
4381                     /* Count how many characters are in it.  In the case of
4382                      * /aa, no folds which contain ASCII code points are
4383                      * allowed, so check for those, and skip if found. */
4384                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4385                         count = utf8_length(s, multi_end);
4386                         s = multi_end;
4387                     }
4388                     else {
4389                         while (s < multi_end) {
4390                             if (isASCII(*s)) {
4391                                 s++;
4392                                 goto next_iteration;
4393                             }
4394                             else {
4395                                 s += UTF8SKIP(s);
4396                             }
4397                             count++;
4398                         }
4399                     }
4400                 }
4401
4402                 /* The delta is how long the sequence is minus 1 (1 is how long
4403                  * the character that folds to the sequence is) */
4404                 total_count_delta += count - 1;
4405               next_iteration: ;
4406             }
4407
4408             /* We created a temporary folded copy of the string in EXACTFL
4409              * nodes.  Therefore we need to be sure it doesn't go below zero,
4410              * as the real string could be shorter */
4411             if (OP(scan) == EXACTFL) {
4412                 int total_chars = utf8_length((U8*) STRING(scan),
4413                                            (U8*) STRING(scan) + STR_LEN(scan));
4414                 if (total_count_delta > total_chars) {
4415                     total_count_delta = total_chars;
4416                 }
4417             }
4418
4419             *min_subtract += total_count_delta;
4420             Safefree(folded);
4421         }
4422         else if (OP(scan) == EXACTFAA) {
4423
4424             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4425              * fold to the ASCII range (and there are no existing ones in the
4426              * upper latin1 range).  But, as outlined in the comments preceding
4427              * this function, we need to flag any occurrences of the sharp s.
4428              * This character forbids trie formation (because of added
4429              * complexity) */
4430 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4431    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4432                                       || UNICODE_DOT_DOT_VERSION > 0)
4433             while (s < s_end) {
4434                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4435                     OP(scan) = EXACTFAA_NO_TRIE;
4436                     *unfolded_multi_char = TRUE;
4437                     break;
4438                 }
4439                 s++;
4440             }
4441         }
4442         else if (OP(scan) != EXACTFAA_NO_TRIE) {
4443
4444             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4445              * folds that are all Latin1.  As explained in the comments
4446              * preceding this function, we look also for the sharp s in EXACTF
4447              * and EXACTFL nodes; it can be in the final position.  Otherwise
4448              * we can stop looking 1 byte earlier because have to find at least
4449              * two characters for a multi-fold */
4450             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4451                               ? s_end
4452                               : s_end -1;
4453
4454             while (s < upper) {
4455                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4456                 if (! len) {    /* Not a multi-char fold. */
4457                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4458                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4459                     {
4460                         *unfolded_multi_char = TRUE;
4461                     }
4462                     s++;
4463                     continue;
4464                 }
4465
4466                 if (len == 2
4467                     && isALPHA_FOLD_EQ(*s, 's')
4468                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4469                 {
4470
4471                     /* EXACTF nodes need to know that the minimum length
4472                      * changed so that a sharp s in the string can match this
4473                      * ss in the pattern, but they remain EXACTF nodes, as they
4474                      * won't match this unless the target string is in UTF-8,
4475                      * which we don't know until runtime.  EXACTFL nodes can't
4476                      * transform into EXACTFU nodes */
4477                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4478                         OP(scan) = EXACTFUP;
4479                     }
4480                 }
4481
4482                 *min_subtract += len - 1;
4483                 s += len;
4484             }
4485 #endif
4486         }
4487     }
4488
4489 #ifdef DEBUGGING
4490     /* Allow dumping but overwriting the collection of skipped
4491      * ops and/or strings with fake optimized ops */
4492     n = scan + NODE_SZ_STR(scan);
4493     while (n <= stop) {
4494         OP(n) = OPTIMIZED;
4495         FLAGS(n) = 0;
4496         NEXT_OFF(n) = 0;
4497         n++;
4498     }
4499 #endif
4500     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4501     return stopnow;
4502 }
4503
4504 /* REx optimizer.  Converts nodes into quicker variants "in place".
4505    Finds fixed substrings.  */
4506
4507 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4508    to the position after last scanned or to NULL. */
4509
4510 #define INIT_AND_WITHP \
4511     assert(!and_withp); \
4512     Newx(and_withp, 1, regnode_ssc); \
4513     SAVEFREEPV(and_withp)
4514
4515
4516 static void
4517 S_unwind_scan_frames(pTHX_ const void *p)
4518 {
4519     scan_frame *f= (scan_frame *)p;
4520     do {
4521         scan_frame *n= f->next_frame;
4522         Safefree(f);
4523         f= n;
4524     } while (f);
4525 }
4526
4527 /* Follow the next-chain of the current node and optimize away
4528    all the NOTHINGs from it.
4529  */
4530 STATIC void
4531 S_rck_elide_nothing(pTHX_ regnode *node)
4532 {
4533     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4534
4535     if (OP(node) != CURLYX) {
4536         const int max = (reg_off_by_arg[OP(node)]
4537                         ? I32_MAX
4538                           /* I32 may be smaller than U16 on CRAYs! */
4539                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4540         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4541         int noff;
4542         regnode *n = node;
4543
4544         /* Skip NOTHING and LONGJMP. */
4545         while (
4546             (n = regnext(n))
4547             && (
4548                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4549                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4550             )
4551             && off + noff < max
4552         ) {
4553             off += noff;
4554         }
4555         if (reg_off_by_arg[OP(node)])
4556             ARG(node) = off;
4557         else
4558             NEXT_OFF(node) = off;
4559     }
4560     return;
4561 }
4562
4563 /* the return from this sub is the minimum length that could possibly match */
4564 STATIC SSize_t
4565 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4566                         SSize_t *minlenp, SSize_t *deltap,
4567                         regnode *last,
4568                         scan_data_t *data,
4569                         I32 stopparen,
4570                         U32 recursed_depth,
4571                         regnode_ssc *and_withp,
4572                         U32 flags, U32 depth, bool was_mutate_ok)
4573                         /* scanp: Start here (read-write). */
4574                         /* deltap: Write maxlen-minlen here. */
4575                         /* last: Stop before this one. */
4576                         /* data: string data about the pattern */
4577                         /* stopparen: treat close N as END */
4578                         /* recursed: which subroutines have we recursed into */
4579                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4580 {
4581     SSize_t final_minlen;
4582     /* There must be at least this number of characters to match */
4583     SSize_t min = 0;
4584     I32 pars = 0, code;
4585     regnode *scan = *scanp, *next;
4586     SSize_t delta = 0;
4587     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4588     int is_inf_internal = 0;            /* The studied chunk is infinite */
4589     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4590     scan_data_t data_fake;
4591     SV *re_trie_maxbuff = NULL;
4592     regnode *first_non_open = scan;
4593     SSize_t stopmin = OPTIMIZE_INFTY;
4594     scan_frame *frame = NULL;
4595     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4596
4597     PERL_ARGS_ASSERT_STUDY_CHUNK;
4598     RExC_study_started= 1;
4599
4600     Zero(&data_fake, 1, scan_data_t);
4601
4602     if ( depth == 0 ) {
4603         while (first_non_open && OP(first_non_open) == OPEN)
4604             first_non_open=regnext(first_non_open);
4605     }
4606
4607
4608   fake_study_recurse:
4609     DEBUG_r(
4610         RExC_study_chunk_recursed_count++;
4611     );
4612     DEBUG_OPTIMISE_MORE_r(
4613     {
4614         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4615             depth, (long)stopparen,
4616             (unsigned long)RExC_study_chunk_recursed_count,
4617             (unsigned long)depth, (unsigned long)recursed_depth,
4618             scan,
4619             last);
4620         if (recursed_depth) {
4621             U32 i;
4622             U32 j;
4623             for ( j = 0 ; j < recursed_depth ; j++ ) {
4624                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4625                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4626                         Perl_re_printf( aTHX_ " %d",(int)i);
4627                         break;
4628                     }
4629                 }
4630                 if ( j + 1 < recursed_depth ) {
4631                     Perl_re_printf( aTHX_  ",");
4632                 }
4633             }
4634         }
4635         Perl_re_printf( aTHX_ "\n");
4636     }
4637     );
4638     while ( scan && OP(scan) != END && scan < last ){
4639         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4640                                    node length to get a real minimum (because
4641                                    the folded version may be shorter) */
4642         bool unfolded_multi_char = FALSE;
4643         /* avoid mutating ops if we are anywhere within the recursed or
4644          * enframed handling for a GOSUB: the outermost level will handle it.
4645          */
4646         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4647         /* Peephole optimizer: */
4648         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4649         DEBUG_PEEP("Peep", scan, depth, flags);
4650
4651
4652         /* The reason we do this here is that we need to deal with things like
4653          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4654          * parsing code, as each (?:..) is handled by a different invocation of
4655          * reg() -- Yves
4656          */
4657         if (PL_regkind[OP(scan)] == EXACT
4658             && OP(scan) != LEXACT
4659             && OP(scan) != LEXACT_REQ8
4660             && mutate_ok
4661         ) {
4662             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4663                     0, NULL, depth + 1);
4664         }
4665
4666         /* Follow the next-chain of the current node and optimize
4667            away all the NOTHINGs from it.
4668          */
4669         rck_elide_nothing(scan);
4670
4671         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4672          * several different things.  */
4673         if ( OP(scan) == DEFINEP ) {
4674             SSize_t minlen = 0;
4675             SSize_t deltanext = 0;
4676             SSize_t fake_last_close = 0;
4677             I32 f = SCF_IN_DEFINE;
4678
4679             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4680             scan = regnext(scan);
4681             assert( OP(scan) == IFTHEN );
4682             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4683
4684             data_fake.last_closep= &fake_last_close;
4685             minlen = *minlenp;
4686             next = regnext(scan);
4687             scan = NEXTOPER(NEXTOPER(scan));
4688             DEBUG_PEEP("scan", scan, depth, flags);
4689             DEBUG_PEEP("next", next, depth, flags);
4690
4691             /* we suppose the run is continuous, last=next...
4692              * NOTE we dont use the return here! */
4693             /* DEFINEP study_chunk() recursion */
4694             (void)study_chunk(pRExC_state, &scan, &minlen,
4695                               &deltanext, next, &data_fake, stopparen,
4696                               recursed_depth, NULL, f, depth+1, mutate_ok);
4697
4698             scan = next;
4699         } else
4700         if (
4701             OP(scan) == BRANCH  ||
4702             OP(scan) == BRANCHJ ||
4703             OP(scan) == IFTHEN
4704         ) {
4705             next = regnext(scan);
4706             code = OP(scan);
4707
4708             /* The op(next)==code check below is to see if we
4709              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4710              * IFTHEN is special as it might not appear in pairs.
4711              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4712              * we dont handle it cleanly. */
4713             if (OP(next) == code || code == IFTHEN) {
4714                 /* NOTE - There is similar code to this block below for
4715                  * handling TRIE nodes on a re-study.  If you change stuff here
4716                  * check there too. */
4717                 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4718                 regnode_ssc accum;
4719                 regnode * const startbranch=scan;
4720
4721                 if (flags & SCF_DO_SUBSTR) {
4722                     /* Cannot merge strings after this. */
4723                     scan_commit(pRExC_state, data, minlenp, is_inf);
4724                 }
4725
4726                 if (flags & SCF_DO_STCLASS)
4727                     ssc_init_zero(pRExC_state, &accum);
4728
4729                 while (OP(scan) == code) {
4730                     SSize_t deltanext, minnext, fake;
4731                     I32 f = 0;
4732                     regnode_ssc this_class;
4733
4734                     DEBUG_PEEP("Branch", scan, depth, flags);
4735
4736                     num++;
4737                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4738                     if (data) {
4739                         data_fake.whilem_c = data->whilem_c;
4740                         data_fake.last_closep = data->last_closep;
4741                     }
4742                     else
4743                         data_fake.last_closep = &fake;
4744
4745                     data_fake.pos_delta = delta;
4746                     next = regnext(scan);
4747
4748                     scan = NEXTOPER(scan); /* everything */
4749                     if (code != BRANCH)    /* everything but BRANCH */
4750                         scan = NEXTOPER(scan);
4751
4752                     if (flags & SCF_DO_STCLASS) {
4753                         ssc_init(pRExC_state, &this_class);
4754                         data_fake.start_class = &this_class;
4755                         f = SCF_DO_STCLASS_AND;
4756                     }
4757                     if (flags & SCF_WHILEM_VISITED_POS)
4758                         f |= SCF_WHILEM_VISITED_POS;
4759
4760                     /* we suppose the run is continuous, last=next...*/
4761                     /* recurse study_chunk() for each BRANCH in an alternation */
4762                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4763                                       &deltanext, next, &data_fake, stopparen,
4764                                       recursed_depth, NULL, f, depth+1,
4765                                       mutate_ok);
4766
4767                     if (min1 > minnext)
4768                         min1 = minnext;
4769                     if (deltanext == OPTIMIZE_INFTY) {
4770                         is_inf = is_inf_internal = 1;
4771                         max1 = OPTIMIZE_INFTY;
4772                     } else if (max1 < minnext + deltanext)
4773                         max1 = minnext + deltanext;
4774                     scan = next;
4775                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4776                         pars++;
4777                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4778                         if ( stopmin > minnext)
4779                             stopmin = min + min1;
4780                         flags &= ~SCF_DO_SUBSTR;
4781                         if (data)
4782                             data->flags |= SCF_SEEN_ACCEPT;
4783                     }
4784                     if (data) {
4785                         if (data_fake.flags & SF_HAS_EVAL)
4786                             data->flags |= SF_HAS_EVAL;
4787                         data->whilem_c = data_fake.whilem_c;
4788                     }
4789                     if (flags & SCF_DO_STCLASS)
4790                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4791                 }
4792                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4793                     min1 = 0;
4794                 if (flags & SCF_DO_SUBSTR) {
4795                     data->pos_min += min1;
4796                     if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4797                         data->pos_delta = OPTIMIZE_INFTY;
4798                     else
4799                         data->pos_delta += max1 - min1;
4800                     if (max1 != min1 || is_inf)
4801                         data->cur_is_floating = 1;
4802                 }
4803                 min += min1;
4804                 if (delta == OPTIMIZE_INFTY
4805                  || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4806                     delta = OPTIMIZE_INFTY;
4807                 else
4808                     delta += max1 - min1;
4809                 if (flags & SCF_DO_STCLASS_OR) {
4810                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4811                     if (min1) {
4812                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4813                         flags &= ~SCF_DO_STCLASS;
4814                     }
4815                 }
4816                 else if (flags & SCF_DO_STCLASS_AND) {
4817                     if (min1) {
4818                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4819                         flags &= ~SCF_DO_STCLASS;
4820                     }
4821                     else {
4822                         /* Switch to OR mode: cache the old value of
4823                          * data->start_class */
4824                         INIT_AND_WITHP;
4825                         StructCopy(data->start_class, and_withp, regnode_ssc);
4826                         flags &= ~SCF_DO_STCLASS_AND;
4827                         StructCopy(&accum, data->start_class, regnode_ssc);
4828                         flags |= SCF_DO_STCLASS_OR;
4829                     }
4830                 }
4831
4832                 if (PERL_ENABLE_TRIE_OPTIMISATION
4833                     && OP(startbranch) == BRANCH
4834                     && mutate_ok
4835                 ) {
4836                 /* demq.
4837
4838                    Assuming this was/is a branch we are dealing with: 'scan'
4839                    now points at the item that follows the branch sequence,
4840                    whatever it is. We now start at the beginning of the
4841                    sequence and look for subsequences of
4842
4843                    BRANCH->EXACT=>x1
4844                    BRANCH->EXACT=>x2
4845                    tail
4846
4847                    which would be constructed from a pattern like
4848                    /A|LIST|OF|WORDS/
4849
4850                    If we can find such a subsequence we need to turn the first
4851                    element into a trie and then add the subsequent branch exact
4852                    strings to the trie.
4853
4854                    We have two cases
4855
4856                      1. patterns where the whole set of branches can be
4857                         converted.
4858
4859                      2. patterns where only a subset can be converted.
4860
4861                    In case 1 we can replace the whole set with a single regop
4862                    for the trie. In case 2 we need to keep the start and end
4863                    branches so
4864
4865                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4866                      becomes BRANCH TRIE; BRANCH X;
4867
4868                   There is an additional case, that being where there is a
4869                   common prefix, which gets split out into an EXACT like node
4870                   preceding the TRIE node.
4871
4872                   If x(1..n)==tail then we can do a simple trie, if not we make
4873                   a "jump" trie, such that when we match the appropriate word
4874                   we "jump" to the appropriate tail node. Essentially we turn
4875                   a nested if into a case structure of sorts.
4876
4877                 */
4878
4879                     int made=0;
4880                     if (!re_trie_maxbuff) {
4881                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4882                         if (!SvIOK(re_trie_maxbuff))
4883                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4884                     }
4885                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4886                         regnode *cur;
4887                         regnode *first = (regnode *)NULL;
4888                         regnode *prev = (regnode *)NULL;
4889                         regnode *tail = scan;
4890                         U8 trietype = 0;
4891                         U32 count=0;
4892
4893                         /* var tail is used because there may be a TAIL
4894                            regop in the way. Ie, the exacts will point to the
4895                            thing following the TAIL, but the last branch will
4896                            point at the TAIL. So we advance tail. If we
4897                            have nested (?:) we may have to move through several
4898                            tails.
4899                          */
4900
4901                         while ( OP( tail ) == TAIL ) {
4902                             /* this is the TAIL generated by (?:) */
4903                             tail = regnext( tail );
4904                         }
4905
4906
4907                         DEBUG_TRIE_COMPILE_r({
4908                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4909                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4910                               depth+1,
4911                               "Looking for TRIE'able sequences. Tail node is ",
4912                               (UV) REGNODE_OFFSET(tail),
4913                               SvPV_nolen_const( RExC_mysv )
4914                             );
4915                         });
4916
4917                         /*
4918
4919                             Step through the branches
4920                                 cur represents each branch,
4921                                 noper is the first thing to be matched as part
4922                                       of that branch
4923                                 noper_next is the regnext() of that node.
4924
4925                             We normally handle a case like this
4926                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4927                             support building with NOJUMPTRIE, which restricts
4928                             the trie logic to structures like /FOO|BAR/.
4929
4930                             If noper is a trieable nodetype then the branch is
4931                             a possible optimization target. If we are building
4932                             under NOJUMPTRIE then we require that noper_next is
4933                             the same as scan (our current position in the regex
4934                             program).
4935
4936                             Once we have two or more consecutive such branches
4937                             we can create a trie of the EXACT's contents and
4938                             stitch it in place into the program.
4939
4940                             If the sequence represents all of the branches in
4941                             the alternation we replace the entire thing with a
4942                             single TRIE node.
4943
4944                             Otherwise when it is a subsequence we need to
4945                             stitch it in place and replace only the relevant
4946                             branches. This means the first branch has to remain
4947                             as it is used by the alternation logic, and its
4948                             next pointer, and needs to be repointed at the item
4949                             on the branch chain following the last branch we
4950                             have optimized away.
4951
4952                             This could be either a BRANCH, in which case the
4953                             subsequence is internal, or it could be the item
4954                             following the branch sequence in which case the
4955                             subsequence is at the end (which does not
4956                             necessarily mean the first node is the start of the
4957                             alternation).
4958
4959                             TRIE_TYPE(X) is a define which maps the optype to a
4960                             trietype.
4961
4962                                 optype          |  trietype
4963                                 ----------------+-----------
4964                                 NOTHING         | NOTHING
4965                                 EXACT           | EXACT
4966                                 EXACT_REQ8     | EXACT
4967                                 EXACTFU         | EXACTFU
4968                                 EXACTFU_REQ8   | EXACTFU
4969                                 EXACTFUP        | EXACTFU
4970                                 EXACTFAA        | EXACTFAA
4971                                 EXACTL          | EXACTL
4972                                 EXACTFLU8       | EXACTFLU8
4973
4974
4975                         */
4976 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4977                        ? NOTHING                                            \
4978                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4979                          ? EXACT                                            \
4980                          : (     EXACTFU == (X)                             \
4981                               || EXACTFU_REQ8 == (X)                       \
4982                               || EXACTFUP == (X) )                          \
4983                            ? EXACTFU                                        \
4984                            : ( EXACTFAA == (X) )                            \
4985                              ? EXACTFAA                                     \
4986                              : ( EXACTL == (X) )                            \
4987                                ? EXACTL                                     \
4988                                : ( EXACTFLU8 == (X) )                       \
4989                                  ? EXACTFLU8                                \
4990                                  : 0 )
4991
4992                         /* dont use tail as the end marker for this traverse */
4993                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4994                             regnode * const noper = NEXTOPER( cur );
4995                             U8 noper_type = OP( noper );
4996                             U8 noper_trietype = TRIE_TYPE( noper_type );
4997 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4998                             regnode * const noper_next = regnext( noper );
4999                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5000                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
5001 #endif
5002
5003                             DEBUG_TRIE_COMPILE_r({
5004                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5005                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
5006                                    depth+1,
5007                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5008
5009                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5010                                 Perl_re_printf( aTHX_  " -> %d:%s",
5011                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5012
5013                                 if ( noper_next ) {
5014                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5015                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5016                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5017                                 }
5018                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5019                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5020                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5021                                 );
5022                             });
5023
5024                             /* Is noper a trieable nodetype that can be merged
5025                              * with the current trie (if there is one)? */
5026                             if ( noper_trietype
5027                                   &&
5028                                   (
5029                                         ( noper_trietype == NOTHING )
5030                                         || ( trietype == NOTHING )
5031                                         || ( trietype == noper_trietype )
5032                                   )
5033 #ifdef NOJUMPTRIE
5034                                   && noper_next >= tail
5035 #endif
5036                                   && count < U16_MAX)
5037                             {
5038                                 /* Handle mergable triable node Either we are
5039                                  * the first node in a new trieable sequence,
5040                                  * in which case we do some bookkeeping,
5041                                  * otherwise we update the end pointer. */
5042                                 if ( !first ) {
5043                                     first = cur;
5044                                     if ( noper_trietype == NOTHING ) {
5045 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5046                                         regnode * const noper_next = regnext( noper );
5047                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5048                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5049 #endif
5050
5051                                         if ( noper_next_trietype ) {
5052                                             trietype = noper_next_trietype;
5053                                         } else if (noper_next_type)  {
5054                                             /* a NOTHING regop is 1 regop wide.
5055                                              * We need at least two for a trie
5056                                              * so we can't merge this in */
5057                                             first = NULL;
5058                                         }
5059                                     } else {
5060                                         trietype = noper_trietype;
5061                                     }
5062                                 } else {
5063                                     if ( trietype == NOTHING )
5064                                         trietype = noper_trietype;
5065                                     prev = cur;
5066                                 }
5067                                 if (first)
5068                                     count++;
5069                             } /* end handle mergable triable node */
5070                             else {
5071                                 /* handle unmergable node -
5072                                  * noper may either be a triable node which can
5073                                  * not be tried together with the current trie,
5074                                  * or a non triable node */
5075                                 if ( prev ) {
5076                                     /* If last is set and trietype is not
5077                                      * NOTHING then we have found at least two
5078                                      * triable branch sequences in a row of a
5079                                      * similar trietype so we can turn them
5080                                      * into a trie. If/when we allow NOTHING to
5081                                      * start a trie sequence this condition
5082                                      * will be required, and it isn't expensive
5083                                      * so we leave it in for now. */
5084                                     if ( trietype && trietype != NOTHING )
5085                                         make_trie( pRExC_state,
5086                                                 startbranch, first, cur, tail,
5087                                                 count, trietype, depth+1 );
5088                                     prev = NULL; /* note: we clear/update
5089                                                     first, trietype etc below,
5090                                                     so we dont do it here */
5091                                 }
5092                                 if ( noper_trietype
5093 #ifdef NOJUMPTRIE
5094                                      && noper_next >= tail
5095 #endif
5096                                 ){
5097                                     /* noper is triable, so we can start a new
5098                                      * trie sequence */
5099                                     count = 1;
5100                                     first = cur;
5101                                     trietype = noper_trietype;
5102                                 } else if (first) {
5103                                     /* if we already saw a first but the
5104                                      * current node is not triable then we have
5105                                      * to reset the first information. */
5106                                     count = 0;
5107                                     first = NULL;
5108                                     trietype = 0;
5109                                 }
5110                             } /* end handle unmergable node */
5111                         } /* loop over branches */
5112                         DEBUG_TRIE_COMPILE_r({
5113                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5114                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5115                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5116                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5117                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5118                                PL_reg_name[trietype]
5119                             );
5120
5121                         });
5122                         if ( prev && trietype ) {
5123                             if ( trietype != NOTHING ) {
5124                                 /* the last branch of the sequence was part of
5125                                  * a trie, so we have to construct it here
5126                                  * outside of the loop */
5127                                 made= make_trie( pRExC_state, startbranch,
5128                                                  first, scan, tail, count,
5129                                                  trietype, depth+1 );
5130 #ifdef TRIE_STUDY_OPT
5131                                 if ( ((made == MADE_EXACT_TRIE &&
5132                                      startbranch == first)
5133                                      || ( first_non_open == first )) &&
5134                                      depth==0 ) {
5135                                     flags |= SCF_TRIE_RESTUDY;
5136                                     if ( startbranch == first
5137                                          && scan >= tail )
5138                                     {
5139                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5140                                     }
5141                                 }
5142 #endif
5143                             } else {
5144                                 /* at this point we know whatever we have is a
5145                                  * NOTHING sequence/branch AND if 'startbranch'
5146                                  * is 'first' then we can turn the whole thing
5147                                  * into a NOTHING
5148                                  */
5149                                 if ( startbranch == first ) {
5150                                     regnode *opt;
5151                                     /* the entire thing is a NOTHING sequence,
5152                                      * something like this: (?:|) So we can
5153                                      * turn it into a plain NOTHING op. */
5154                                     DEBUG_TRIE_COMPILE_r({
5155                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5156                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5157                                           depth+1,
5158                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5159
5160                                     });
5161                                     OP(startbranch)= NOTHING;
5162                                     NEXT_OFF(startbranch)= tail - startbranch;
5163                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5164                                         OP(opt)= OPTIMIZED;
5165                                 }
5166                             }
5167                         } /* end if ( prev) */
5168                     } /* TRIE_MAXBUF is non zero */
5169                 } /* do trie */
5170
5171             }
5172             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5173                 scan = NEXTOPER(NEXTOPER(scan));
5174             } else                      /* single branch is optimized. */
5175                 scan = NEXTOPER(scan);
5176             continue;
5177         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5178             I32 paren = 0;
5179             regnode *start = NULL;
5180             regnode *end = NULL;
5181             U32 my_recursed_depth= recursed_depth;
5182
5183             if (OP(scan) != SUSPEND) { /* GOSUB */
5184                 /* Do setup, note this code has side effects beyond
5185                  * the rest of this block. Specifically setting
5186                  * RExC_recurse[] must happen at least once during
5187                  * study_chunk(). */
5188                 paren = ARG(scan);
5189                 RExC_recurse[ARG2L(scan)] = scan;
5190                 start = REGNODE_p(RExC_open_parens[paren]);
5191                 end   = REGNODE_p(RExC_close_parens[paren]);
5192
5193                 /* NOTE we MUST always execute the above code, even
5194                  * if we do nothing with a GOSUB */
5195                 if (
5196                     ( flags & SCF_IN_DEFINE )
5197                     ||
5198                     (
5199                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5200                         &&
5201                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5202                     )
5203                 ) {
5204                     /* no need to do anything here if we are in a define. */
5205                     /* or we are after some kind of infinite construct
5206                      * so we can skip recursing into this item.
5207                      * Since it is infinite we will not change the maxlen
5208                      * or delta, and if we miss something that might raise
5209                      * the minlen it will merely pessimise a little.
5210                      *
5211                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5212                      * might result in a minlen of 1 and not of 4,
5213                      * but this doesn't make us mismatch, just try a bit
5214                      * harder than we should.
5215                      * */
5216                     scan= regnext(scan);
5217                     continue;
5218                 }
5219
5220                 if (
5221                     !recursed_depth
5222                     || !PAREN_TEST(recursed_depth - 1, paren)
5223                 ) {
5224                     /* it is quite possible that there are more efficient ways
5225                      * to do this. We maintain a bitmap per level of recursion
5226                      * of which patterns we have entered so we can detect if a
5227                      * pattern creates a possible infinite loop. When we
5228                      * recurse down a level we copy the previous levels bitmap
5229                      * down. When we are at recursion level 0 we zero the top
5230                      * level bitmap. It would be nice to implement a different
5231                      * more efficient way of doing this. In particular the top
5232                      * level bitmap may be unnecessary.
5233                      */
5234                     if (!recursed_depth) {
5235                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5236                     } else {
5237                         Copy(PAREN_OFFSET(recursed_depth - 1),
5238                              PAREN_OFFSET(recursed_depth),
5239                              RExC_study_chunk_recursed_bytes, U8);
5240                     }
5241                     /* we havent recursed into this paren yet, so recurse into it */
5242                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5243                     PAREN_SET(recursed_depth, paren);
5244                     my_recursed_depth= recursed_depth + 1;
5245                 } else {
5246                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5247                     /* some form of infinite recursion, assume infinite length
5248                      * */
5249                     if (flags & SCF_DO_SUBSTR) {
5250                         scan_commit(pRExC_state, data, minlenp, is_inf);
5251                         data->cur_is_floating = 1;
5252                     }
5253                     is_inf = is_inf_internal = 1;
5254                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5255                         ssc_anything(data->start_class);
5256                     flags &= ~SCF_DO_STCLASS;
5257
5258                     start= NULL; /* reset start so we dont recurse later on. */
5259                 }
5260             } else {
5261                 paren = stopparen;
5262                 start = scan + 2;
5263                 end = regnext(scan);
5264             }
5265             if (start) {
5266                 scan_frame *newframe;
5267                 assert(end);
5268                 if (!RExC_frame_last) {
5269                     Newxz(newframe, 1, scan_frame);
5270                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5271                     RExC_frame_head= newframe;
5272                     RExC_frame_count++;
5273                 } else if (!RExC_frame_last->next_frame) {
5274                     Newxz(newframe, 1, scan_frame);
5275                     RExC_frame_last->next_frame= newframe;
5276                     newframe->prev_frame= RExC_frame_last;
5277                     RExC_frame_count++;
5278                 } else {
5279                     newframe= RExC_frame_last->next_frame;
5280                 }
5281                 RExC_frame_last= newframe;
5282
5283                 newframe->next_regnode = regnext(scan);
5284                 newframe->last_regnode = last;
5285                 newframe->stopparen = stopparen;
5286                 newframe->prev_recursed_depth = recursed_depth;
5287                 newframe->this_prev_frame= frame;
5288                 newframe->in_gosub = (
5289                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5290                 );
5291
5292                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5293                 DEBUG_PEEP("fnew", scan, depth, flags);
5294
5295                 frame = newframe;
5296                 scan =  start;
5297                 stopparen = paren;
5298                 last = end;
5299                 depth = depth + 1;
5300                 recursed_depth= my_recursed_depth;
5301
5302                 continue;
5303             }
5304         }
5305         else if (   OP(scan) == EXACT
5306                  || OP(scan) == LEXACT
5307                  || OP(scan) == EXACT_REQ8
5308                  || OP(scan) == LEXACT_REQ8
5309                  || OP(scan) == EXACTL)
5310         {
5311             SSize_t bytelen = STR_LEN(scan), charlen;
5312             UV uc;
5313             assert(bytelen);
5314             if (UTF) {
5315                 const U8 * const s = (U8*)STRING(scan);
5316                 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5317                 charlen = utf8_length(s, s + bytelen);
5318             } else {
5319                 uc = *((U8*)STRING(scan));
5320                 charlen = bytelen;
5321             }
5322             min += charlen;
5323             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5324                 /* The code below prefers earlier match for fixed
5325                    offset, later match for variable offset.  */
5326                 if (data->last_end == -1) { /* Update the start info. */
5327                     data->last_start_min = data->pos_min;
5328                     data->last_start_max =
5329                         is_inf ? OPTIMIZE_INFTY
5330                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5331                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5332                 }
5333                 sv_catpvn(data->last_found, STRING(scan), bytelen);
5334                 if (UTF)
5335                     SvUTF8_on(data->last_found);
5336                 {
5337                     SV * const sv = data->last_found;
5338                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5339                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5340                     if (mg && mg->mg_len >= 0)
5341                         mg->mg_len += charlen;
5342                 }
5343                 data->last_end = data->pos_min + charlen;
5344                 data->pos_min += charlen; /* As in the first entry. */
5345                 data->flags &= ~SF_BEFORE_EOL;
5346             }
5347
5348             /* ANDing the code point leaves at most it, and not in locale, and
5349              * can't match null string */
5350             if (flags & SCF_DO_STCLASS_AND) {
5351                 ssc_cp_and(data->start_class, uc);
5352                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5353                 ssc_clear_locale(data->start_class);
5354             }
5355             else if (flags & SCF_DO_STCLASS_OR) {
5356                 ssc_add_cp(data->start_class, uc);
5357                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5358
5359                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5360                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5361             }
5362             flags &= ~SCF_DO_STCLASS;
5363         }
5364         else if (PL_regkind[OP(scan)] == EXACT) {
5365             /* But OP != EXACT!, so is EXACTFish */
5366             SSize_t bytelen = STR_LEN(scan), charlen;
5367             const U8 * s = (U8*)STRING(scan);
5368
5369             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5370              * with the mask set to the complement of the bit that differs
5371              * between upper and lower case, and the lowest code point of the
5372              * pair (which the '&' forces) */
5373             if (     bytelen == 1
5374                 &&   isALPHA_A(*s)
5375                 &&  (         OP(scan) == EXACTFAA
5376                      || (     OP(scan) == EXACTFU
5377                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5378                 &&   mutate_ok
5379             ) {
5380                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5381
5382                 OP(scan) = ANYOFM;
5383                 ARG_SET(scan, *s & mask);
5384                 FLAGS(scan) = mask;
5385                 /* we're not EXACTFish any more, so restudy */
5386                 continue;
5387             }
5388
5389             /* Search for fixed substrings supports EXACT only. */
5390             if (flags & SCF_DO_SUBSTR) {
5391                 assert(data);
5392                 scan_commit(pRExC_state, data, minlenp, is_inf);
5393             }
5394             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5395             if (unfolded_multi_char) {
5396                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5397             }
5398             min += charlen - min_subtract;
5399             assert (min >= 0);
5400             delta += min_subtract;
5401             if (flags & SCF_DO_SUBSTR) {
5402                 data->pos_min += charlen - min_subtract;
5403                 if (data->pos_min < 0) {
5404                     data->pos_min = 0;
5405                 }
5406                 data->pos_delta += min_subtract;
5407                 if (min_subtract) {
5408                     data->cur_is_floating = 1; /* float */
5409                 }
5410             }
5411
5412             if (flags & SCF_DO_STCLASS) {
5413                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5414
5415                 assert(EXACTF_invlist);
5416                 if (flags & SCF_DO_STCLASS_AND) {
5417                     if (OP(scan) != EXACTFL)
5418                         ssc_clear_locale(data->start_class);
5419                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5420                     ANYOF_POSIXL_ZERO(data->start_class);
5421                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5422                 }
5423                 else {  /* SCF_DO_STCLASS_OR */
5424                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5425                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5426
5427                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5428                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5429                 }
5430                 flags &= ~SCF_DO_STCLASS;
5431                 SvREFCNT_dec(EXACTF_invlist);
5432             }
5433         }
5434         else if (REGNODE_VARIES(OP(scan))) {
5435             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5436             I32 fl = 0, f = flags;
5437             regnode * const oscan = scan;
5438             regnode_ssc this_class;
5439             regnode_ssc *oclass = NULL;
5440             I32 next_is_eval = 0;
5441
5442             switch (PL_regkind[OP(scan)]) {
5443             case WHILEM:                /* End of (?:...)* . */
5444                 scan = NEXTOPER(scan);
5445                 goto finish;
5446             case PLUS:
5447                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5448                     next = NEXTOPER(scan);
5449                     if (   OP(next) == EXACT
5450                         || OP(next) == LEXACT
5451                         || OP(next) == EXACT_REQ8
5452                         || OP(next) == LEXACT_REQ8
5453                         || OP(next) == EXACTL
5454                         || (flags & SCF_DO_STCLASS))
5455                     {
5456                         mincount = 1;
5457                         maxcount = REG_INFTY;
5458                         next = regnext(scan);
5459                         scan = NEXTOPER(scan);
5460                         goto do_curly;
5461                     }
5462                 }
5463                 if (flags & SCF_DO_SUBSTR)
5464                     data->pos_min++;
5465                 min++;
5466                 /* FALLTHROUGH */
5467             case STAR:
5468                 next = NEXTOPER(scan);
5469
5470                 /* This temporary node can now be turned into EXACTFU, and
5471                  * must, as regexec.c doesn't handle it */
5472                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5473                     OP(next) = EXACTFU;
5474                 }
5475
5476                 if (     STR_LEN(next) == 1
5477                     &&   isALPHA_A(* STRING(next))
5478                     && (         OP(next) == EXACTFAA
5479                         || (     OP(next) == EXACTFU
5480                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5481                     &&   mutate_ok
5482                 ) {
5483                     /* These differ in just one bit */
5484                     U8 mask = ~ ('A' ^ 'a');
5485
5486                     assert(isALPHA_A(* STRING(next)));
5487
5488                     /* Then replace it by an ANYOFM node, with
5489                     * the mask set to the complement of the
5490                     * bit that differs between upper and lower
5491                     * case, and the lowest code point of the
5492                     * pair (which the '&' forces) */
5493                     OP(next) = ANYOFM;
5494                     ARG_SET(next, *STRING(next) & mask);
5495                     FLAGS(next) = mask;
5496                 }
5497
5498                 if (flags & SCF_DO_STCLASS) {
5499                     mincount = 0;
5500                     maxcount = REG_INFTY;
5501                     next = regnext(scan);
5502                     scan = NEXTOPER(scan);
5503                     goto do_curly;
5504                 }
5505                 if (flags & SCF_DO_SUBSTR) {
5506                     scan_commit(pRExC_state, data, minlenp, is_inf);
5507                     /* Cannot extend fixed substrings */
5508                     data->cur_is_floating = 1; /* float */
5509                 }
5510                 is_inf = is_inf_internal = 1;
5511                 scan = regnext(scan);
5512                 goto optimize_curly_tail;
5513             case CURLY:
5514                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5515                     && (scan->flags == stopparen))
5516                 {
5517                     mincount = 1;
5518                     maxcount = 1;
5519                 } else {
5520                     mincount = ARG1(scan);
5521                     maxcount = ARG2(scan);
5522                 }
5523                 next = regnext(scan);
5524                 if (OP(scan) == CURLYX) {
5525                     I32 lp = (data ? *(data->last_closep) : 0);
5526                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5527                 }
5528                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5529                 next_is_eval = (OP(scan) == EVAL);
5530               do_curly:
5531                 if (flags & SCF_DO_SUBSTR) {
5532                     if (mincount == 0)
5533                         scan_commit(pRExC_state, data, minlenp, is_inf);
5534                     /* Cannot extend fixed substrings */
5535                     pos_before = data->pos_min;
5536                 }
5537                 if (data) {
5538                     fl = data->flags;
5539                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5540                     if (is_inf)
5541                         data->flags |= SF_IS_INF;
5542                 }
5543                 if (flags & SCF_DO_STCLASS) {
5544                     ssc_init(pRExC_state, &this_class);
5545                     oclass = data->start_class;
5546                     data->start_class = &this_class;
5547                     f |= SCF_DO_STCLASS_AND;
5548                     f &= ~SCF_DO_STCLASS_OR;
5549                 }
5550                 /* Exclude from super-linear cache processing any {n,m}
5551                    regops for which the combination of input pos and regex
5552                    pos is not enough information to determine if a match
5553                    will be possible.
5554
5555                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5556                    regex pos at the \s*, the prospects for a match depend not
5557                    only on the input position but also on how many (bar\s*)
5558                    repeats into the {4,8} we are. */
5559                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5560                     f &= ~SCF_WHILEM_VISITED_POS;
5561
5562                 /* This will finish on WHILEM, setting scan, or on NULL: */
5563                 /* recurse study_chunk() on loop bodies */
5564                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5565                                   last, data, stopparen, recursed_depth, NULL,
5566                                   (mincount == 0
5567                                    ? (f & ~SCF_DO_SUBSTR)
5568                                    : f)
5569                                   , depth+1, mutate_ok);
5570
5571                 if (flags & SCF_DO_STCLASS)
5572                     data->start_class = oclass;
5573                 if (mincount == 0 || minnext == 0) {
5574                     if (flags & SCF_DO_STCLASS_OR) {
5575                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5576                     }
5577                     else if (flags & SCF_DO_STCLASS_AND) {
5578                         /* Switch to OR mode: cache the old value of
5579                          * data->start_class */
5580                         INIT_AND_WITHP;
5581                         StructCopy(data->start_class, and_withp, regnode_ssc);
5582                         flags &= ~SCF_DO_STCLASS_AND;
5583                         StructCopy(&this_class, data->start_class, regnode_ssc);
5584                         flags |= SCF_DO_STCLASS_OR;
5585                         ANYOF_FLAGS(data->start_class)
5586                                                 |= SSC_MATCHES_EMPTY_STRING;
5587                     }
5588                 } else {                /* Non-zero len */
5589                     if (flags & SCF_DO_STCLASS_OR) {
5590                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5591                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5592                     }
5593                     else if (flags & SCF_DO_STCLASS_AND)
5594                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5595                     flags &= ~SCF_DO_STCLASS;
5596                 }
5597                 if (!scan)              /* It was not CURLYX, but CURLY. */
5598                     scan = next;
5599                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5600                     /* ? quantifier ok, except for (?{ ... }) */
5601                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5602                     && (minnext == 0) && (deltanext == 0)
5603                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5604                     && maxcount <= REG_INFTY/3) /* Complement check for big
5605                                                    count */
5606                 {
5607                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5608                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5609                             "Quantifier unexpected on zero-length expression "
5610                             "in regex m/%" UTF8f "/",
5611                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5612                                   RExC_precomp)));
5613                 }
5614
5615                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5616                     || min >= SSize_t_MAX - minnext * mincount )
5617                 {
5618                     FAIL("Regexp out of space");
5619                 }
5620
5621                 min += minnext * mincount;
5622                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5623                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5624                 is_inf |= is_inf_internal;
5625                 if (is_inf) {
5626                     delta = OPTIMIZE_INFTY;
5627                 } else {
5628                     delta += (minnext + deltanext) * maxcount
5629                              - minnext * mincount;
5630                 }
5631                 /* Try powerful optimization CURLYX => CURLYN. */
5632                 if (  OP(oscan) == CURLYX && data
5633                       && data->flags & SF_IN_PAR
5634                       && !(data->flags & SF_HAS_EVAL)
5635                       && !deltanext && minnext == 1
5636                       && mutate_ok
5637                 ) {
5638                     /* Try to optimize to CURLYN.  */
5639                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5640                     regnode * const nxt1 = nxt;
5641 #ifdef DEBUGGING
5642                     regnode *nxt2;
5643 #endif
5644
5645                     /* Skip open. */
5646                     nxt = regnext(nxt);
5647                     if (!REGNODE_SIMPLE(OP(nxt))
5648                         && !(PL_regkind[OP(nxt)] == EXACT
5649                              && STR_LEN(nxt) == 1))
5650                         goto nogo;
5651 #ifdef DEBUGGING
5652                     nxt2 = nxt;
5653 #endif
5654                     nxt = regnext(nxt);
5655                     if (OP(nxt) != CLOSE)
5656                         goto nogo;
5657                     if (RExC_open_parens) {
5658
5659                         /*open->CURLYM*/
5660                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5661
5662                         /*close->while*/
5663                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5664                     }
5665                     /* Now we know that nxt2 is the only contents: */
5666                     oscan->flags = (U8)ARG(nxt);
5667                     OP(oscan) = CURLYN;
5668                     OP(nxt1) = NOTHING; /* was OPEN. */
5669
5670 #ifdef DEBUGGING
5671                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5672                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5673                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5674                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5675                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5676                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5677 #endif
5678                 }
5679               nogo:
5680
5681                 /* Try optimization CURLYX => CURLYM. */
5682                 if (  OP(oscan) == CURLYX && data
5683                       && !(data->flags & SF_HAS_PAR)
5684                       && !(data->flags & SF_HAS_EVAL)
5685                       && !deltanext     /* atom is fixed width */
5686                       && minnext != 0   /* CURLYM can't handle zero width */
5687                          /* Nor characters whose fold at run-time may be
5688                           * multi-character */
5689                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5690                       && mutate_ok
5691                 ) {
5692                     /* XXXX How to optimize if data == 0? */
5693                     /* Optimize to a simpler form.  */
5694                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5695                     regnode *nxt2;
5696
5697                     OP(oscan) = CURLYM;
5698                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5699                             && (OP(nxt2) != WHILEM))
5700                         nxt = nxt2;
5701                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5702                     /* Need to optimize away parenths. */
5703                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5704                         /* Set the parenth number.  */
5705                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5706
5707                         oscan->flags = (U8)ARG(nxt);
5708                         if (RExC_open_parens) {
5709                              /*open->CURLYM*/
5710                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5711
5712                             /*close->NOTHING*/
5713                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5714                                                          + 1;
5715                         }
5716                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5717                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5718
5719 #ifdef DEBUGGING
5720                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5721                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5722                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5723                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5724 #endif
5725 #if 0
5726                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5727                             regnode *nnxt = regnext(nxt1);
5728                             if (nnxt == nxt) {
5729                                 if (reg_off_by_arg[OP(nxt1)])
5730                                     ARG_SET(nxt1, nxt2 - nxt1);
5731                                 else if (nxt2 - nxt1 < U16_MAX)
5732                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5733                                 else
5734                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5735                             }
5736                             nxt1 = nnxt;
5737                         }
5738 #endif
5739                         /* Optimize again: */
5740                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5741                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5742                                     NULL, stopparen, recursed_depth, NULL, 0,
5743                                     depth+1, mutate_ok);
5744                     }
5745                     else
5746                         oscan->flags = 0;
5747                 }
5748                 else if ((OP(oscan) == CURLYX)
5749                          && (flags & SCF_WHILEM_VISITED_POS)
5750                          /* See the comment on a similar expression above.
5751                             However, this time it's not a subexpression
5752                             we care about, but the expression itself. */
5753                          && (maxcount == REG_INFTY)
5754                          && data) {
5755                     /* This stays as CURLYX, we can put the count/of pair. */
5756                     /* Find WHILEM (as in regexec.c) */
5757                     regnode *nxt = oscan + NEXT_OFF(oscan);
5758
5759                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5760                         nxt += ARG(nxt);
5761                     nxt = PREVOPER(nxt);
5762                     if (nxt->flags & 0xf) {
5763                         /* we've already set whilem count on this node */
5764                     } else if (++data->whilem_c < 16) {
5765                         assert(data->whilem_c <= RExC_whilem_seen);
5766                         nxt->flags = (U8)(data->whilem_c
5767                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5768                     }
5769                 }
5770                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5771                     pars++;
5772                 if (flags & SCF_DO_SUBSTR) {
5773                     SV *last_str = NULL;
5774                     STRLEN last_chrs = 0;
5775                     int counted = mincount != 0;
5776
5777                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5778                                                                   string. */
5779                         SSize_t b = pos_before >= data->last_start_min
5780                             ? pos_before : data->last_start_min;
5781                         STRLEN l;
5782                         const char * const s = SvPV_const(data->last_found, l);
5783                         SSize_t old = b - data->last_start_min;
5784                         assert(old >= 0);
5785
5786                         if (UTF)
5787                             old = utf8_hop_forward((U8*)s, old,
5788                                                (U8 *) SvEND(data->last_found))
5789                                 - (U8*)s;
5790                         l -= old;
5791                         /* Get the added string: */
5792                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5793                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5794                                             (U8*)(s + old + l)) : l;
5795                         if (deltanext == 0 && pos_before == b) {
5796                             /* What was added is a constant string */
5797                             if (mincount > 1) {
5798
5799                                 SvGROW(last_str, (mincount * l) + 1);
5800                                 repeatcpy(SvPVX(last_str) + l,
5801                                           SvPVX_const(last_str), l,
5802                                           mincount - 1);
5803                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5804                                 /* Add additional parts. */
5805                                 SvCUR_set(data->last_found,
5806                                           SvCUR(data->last_found) - l);
5807                                 sv_catsv(data->last_found, last_str);
5808                                 {
5809                                     SV * sv = data->last_found;
5810                                     MAGIC *mg =
5811                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5812                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5813                                     if (mg && mg->mg_len >= 0)
5814                                         mg->mg_len += last_chrs * (mincount-1);
5815                                 }
5816                                 last_chrs *= mincount;
5817                                 data->last_end += l * (mincount - 1);
5818                             }
5819                         } else {
5820                             /* start offset must point into the last copy */
5821                             data->last_start_min += minnext * (mincount - 1);
5822                             data->last_start_max =
5823                               is_inf
5824                                ? OPTIMIZE_INFTY
5825                                : data->last_start_max +
5826                                  (maxcount - 1) * (minnext + data->pos_delta);
5827                         }
5828                     }
5829                     /* It is counted once already... */
5830                     data->pos_min += minnext * (mincount - counted);
5831 #if 0
5832 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5833                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5834                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5835     (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5836     (UV)mincount);
5837 if (deltanext != OPTIMIZE_INFTY)
5838 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5839     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5840           - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5841 #endif
5842                     if (deltanext == OPTIMIZE_INFTY
5843                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5844                         data->pos_delta = OPTIMIZE_INFTY;
5845                     else
5846                         data->pos_delta += - counted * deltanext +
5847                         (minnext + deltanext) * maxcount - minnext * mincount;
5848                     if (mincount != maxcount) {
5849                          /* Cannot extend fixed substrings found inside
5850                             the group.  */
5851                         scan_commit(pRExC_state, data, minlenp, is_inf);
5852                         if (mincount && last_str) {
5853                             SV * const sv = data->last_found;
5854                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5855                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5856
5857                             if (mg)
5858                                 mg->mg_len = -1;
5859                             sv_setsv(sv, last_str);
5860                             data->last_end = data->pos_min;
5861                             data->last_start_min = data->pos_min - last_chrs;
5862                             data->last_start_max = is_inf
5863                                 ? OPTIMIZE_INFTY
5864                                 : data->pos_min + data->pos_delta - last_chrs;
5865                         }
5866                         data->cur_is_floating = 1; /* float */
5867                     }
5868                     SvREFCNT_dec(last_str);
5869                 }
5870                 if (data && (fl & SF_HAS_EVAL))
5871                     data->flags |= SF_HAS_EVAL;
5872               optimize_curly_tail:
5873                 rck_elide_nothing(oscan);
5874                 continue;
5875
5876             default:
5877                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5878                                                                     OP(scan));
5879             case REF:
5880             case CLUMP:
5881                 if (flags & SCF_DO_SUBSTR) {
5882                     /* Cannot expect anything... */
5883                     scan_commit(pRExC_state, data, minlenp, is_inf);
5884                     data->cur_is_floating = 1; /* float */
5885                 }
5886                 is_inf = is_inf_internal = 1;
5887                 if (flags & SCF_DO_STCLASS_OR) {
5888                     if (OP(scan) == CLUMP) {
5889                         /* Actually is any start char, but very few code points
5890                          * aren't start characters */
5891                         ssc_match_all_cp(data->start_class);
5892                     }
5893                     else {
5894                         ssc_anything(data->start_class);
5895                     }
5896                 }
5897                 flags &= ~SCF_DO_STCLASS;
5898                 break;
5899             }
5900         }
5901         else if (OP(scan) == LNBREAK) {
5902             if (flags & SCF_DO_STCLASS) {
5903                 if (flags & SCF_DO_STCLASS_AND) {
5904                     ssc_intersection(data->start_class,
5905                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5906                     ssc_clear_locale(data->start_class);
5907                     ANYOF_FLAGS(data->start_class)
5908                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5909                 }
5910                 else if (flags & SCF_DO_STCLASS_OR) {
5911                     ssc_union(data->start_class,
5912                               PL_XPosix_ptrs[_CC_VERTSPACE],
5913                               FALSE);
5914                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5915
5916                     /* See commit msg for
5917                      * 749e076fceedeb708a624933726e7989f2302f6a */
5918                     ANYOF_FLAGS(data->start_class)
5919                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5920                 }
5921                 flags &= ~SCF_DO_STCLASS;
5922             }
5923             min++;
5924             if (delta != OPTIMIZE_INFTY)
5925                 delta++;    /* Because of the 2 char string cr-lf */
5926             if (flags & SCF_DO_SUBSTR) {
5927                 /* Cannot expect anything... */
5928                 scan_commit(pRExC_state, data, minlenp, is_inf);
5929                 data->pos_min += 1;
5930                 if (data->pos_delta != OPTIMIZE_INFTY) {
5931                     data->pos_delta += 1;
5932                 }
5933                 data->cur_is_floating = 1; /* float */
5934             }
5935         }
5936         else if (REGNODE_SIMPLE(OP(scan))) {
5937
5938             if (flags & SCF_DO_SUBSTR) {
5939                 scan_commit(pRExC_state, data, minlenp, is_inf);
5940                 data->pos_min++;
5941             }
5942             min++;
5943             if (flags & SCF_DO_STCLASS) {
5944                 bool invert = 0;
5945                 SV* my_invlist = NULL;
5946                 U8 namedclass;
5947
5948                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5949                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5950
5951                 /* Some of the logic below assumes that switching
5952                    locale on will only add false positives. */
5953                 switch (OP(scan)) {
5954
5955                 default:
5956 #ifdef DEBUGGING
5957                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5958                                                                      OP(scan));
5959 #endif
5960                 case SANY:
5961                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5962                         ssc_match_all_cp(data->start_class);
5963                     break;
5964
5965                 case REG_ANY:
5966                     {
5967                         SV* REG_ANY_invlist = _new_invlist(2);
5968                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5969                                                             '\n');
5970                         if (flags & SCF_DO_STCLASS_OR) {
5971                             ssc_union(data->start_class,
5972                                       REG_ANY_invlist,
5973                                       TRUE /* TRUE => invert, hence all but \n
5974                                             */
5975                                       );
5976                         }
5977                         else if (flags & SCF_DO_STCLASS_AND) {
5978                             ssc_intersection(data->start_class,
5979                                              REG_ANY_invlist,
5980                                              TRUE  /* TRUE => invert */
5981                                              );
5982                             ssc_clear_locale(data->start_class);
5983                         }
5984                         SvREFCNT_dec_NN(REG_ANY_invlist);
5985                     }
5986                     break;
5987
5988                 case ANYOFD:
5989                 case ANYOFL:
5990                 case ANYOFPOSIXL:
5991                 case ANYOFH:
5992                 case ANYOFHb:
5993                 case ANYOFHr:
5994                 case ANYOFHs:
5995                 case ANYOF:
5996                     if (flags & SCF_DO_STCLASS_AND)
5997                         ssc_and(pRExC_state, data->start_class,
5998                                 (regnode_charclass *) scan);
5999                     else
6000                         ssc_or(pRExC_state, data->start_class,
6001                                                           (regnode_charclass *) scan);
6002                     break;
6003
6004                 case NANYOFM: /* NANYOFM already contains the inversion of the
6005                                  input ANYOF data, so, unlike things like
6006                                  NPOSIXA, don't change 'invert' to TRUE */
6007                     /* FALLTHROUGH */
6008                 case ANYOFM:
6009                   {
6010                     SV* cp_list = get_ANYOFM_contents(scan);
6011
6012                     if (flags & SCF_DO_STCLASS_OR) {
6013                         ssc_union(data->start_class, cp_list, invert);
6014                     }
6015                     else if (flags & SCF_DO_STCLASS_AND) {
6016                         ssc_intersection(data->start_class, cp_list, invert);
6017                     }
6018
6019                     SvREFCNT_dec_NN(cp_list);
6020                     break;
6021                   }
6022
6023                 case ANYOFR:
6024                 case ANYOFRb:
6025                   {
6026                     SV* cp_list = NULL;
6027
6028                     cp_list = _add_range_to_invlist(cp_list,
6029                                         ANYOFRbase(scan),
6030                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
6031
6032                     if (flags & SCF_DO_STCLASS_OR) {
6033                         ssc_union(data->start_class, cp_list, invert);
6034                     }
6035                     else if (flags & SCF_DO_STCLASS_AND) {
6036                         ssc_intersection(data->start_class, cp_list, invert);
6037                     }
6038
6039                     SvREFCNT_dec_NN(cp_list);
6040                     break;
6041                   }
6042
6043                 case NPOSIXL:
6044                     invert = 1;
6045                     /* FALLTHROUGH */
6046
6047                 case POSIXL:
6048                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6049                     if (flags & SCF_DO_STCLASS_AND) {
6050                         bool was_there = cBOOL(
6051                                           ANYOF_POSIXL_TEST(data->start_class,
6052                                                                  namedclass));
6053                         ANYOF_POSIXL_ZERO(data->start_class);
6054                         if (was_there) {    /* Do an AND */
6055                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6056                         }
6057                         /* No individual code points can now match */
6058                         data->start_class->invlist
6059                                                 = sv_2mortal(_new_invlist(0));
6060                     }
6061                     else {
6062                         int complement = namedclass + ((invert) ? -1 : 1);
6063
6064                         assert(flags & SCF_DO_STCLASS_OR);
6065
6066                         /* If the complement of this class was already there,
6067                          * the result is that they match all code points,
6068                          * (\d + \D == everything).  Remove the classes from
6069                          * future consideration.  Locale is not relevant in
6070                          * this case */
6071                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6072                             ssc_match_all_cp(data->start_class);
6073                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6074                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
6075                         }
6076                         else {  /* The usual case; just add this class to the
6077                                    existing set */
6078                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6079                         }
6080                     }
6081                     break;
6082
6083                 case NPOSIXA:   /* For these, we always know the exact set of
6084                                    what's matched */
6085                     invert = 1;
6086                     /* FALLTHROUGH */
6087                 case POSIXA:
6088                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6089                     goto join_posix_and_ascii;
6090
6091                 case NPOSIXD:
6092                 case NPOSIXU:
6093                     invert = 1;
6094                     /* FALLTHROUGH */
6095                 case POSIXD:
6096                 case POSIXU:
6097                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6098
6099                     /* NPOSIXD matches all upper Latin1 code points unless the
6100                      * target string being matched is UTF-8, which is
6101                      * unknowable until match time.  Since we are going to
6102                      * invert, we want to get rid of all of them so that the
6103                      * inversion will match all */
6104                     if (OP(scan) == NPOSIXD) {
6105                         _invlist_subtract(my_invlist, PL_UpperLatin1,
6106                                           &my_invlist);
6107                     }
6108
6109                   join_posix_and_ascii:
6110
6111                     if (flags & SCF_DO_STCLASS_AND) {
6112                         ssc_intersection(data->start_class, my_invlist, invert);
6113                         ssc_clear_locale(data->start_class);
6114                     }
6115                     else {
6116                         assert(flags & SCF_DO_STCLASS_OR);
6117                         ssc_union(data->start_class, my_invlist, invert);
6118                     }
6119                     SvREFCNT_dec(my_invlist);
6120                 }
6121                 if (flags & SCF_DO_STCLASS_OR)
6122                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6123                 flags &= ~SCF_DO_STCLASS;
6124             }
6125         }
6126         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6127             data->flags |= (OP(scan) == MEOL
6128                             ? SF_BEFORE_MEOL
6129                             : SF_BEFORE_SEOL);
6130             scan_commit(pRExC_state, data, minlenp, is_inf);
6131
6132         }
6133         else if (  PL_regkind[OP(scan)] == BRANCHJ
6134                  /* Lookbehind, or need to calculate parens/evals/stclass: */
6135                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
6136                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6137         {
6138             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6139                 || OP(scan) == UNLESSM )
6140             {
6141                 /* Negative Lookahead/lookbehind
6142                    In this case we can't do fixed string optimisation.
6143                 */
6144
6145                 SSize_t deltanext, minnext, fake = 0;
6146                 regnode *nscan;
6147                 regnode_ssc intrnl;
6148                 int f = 0;
6149
6150                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6151                 if (data) {
6152                     data_fake.whilem_c = data->whilem_c;
6153                     data_fake.last_closep = data->last_closep;
6154                 }
6155                 else
6156                     data_fake.last_closep = &fake;
6157                 data_fake.pos_delta = delta;
6158                 if ( flags & SCF_DO_STCLASS && !scan->flags
6159                      && OP(scan) == IFMATCH ) { /* Lookahead */
6160                     ssc_init(pRExC_state, &intrnl);
6161                     data_fake.start_class = &intrnl;
6162                     f |= SCF_DO_STCLASS_AND;
6163                 }
6164                 if (flags & SCF_WHILEM_VISITED_POS)
6165                     f |= SCF_WHILEM_VISITED_POS;
6166                 next = regnext(scan);
6167                 nscan = NEXTOPER(NEXTOPER(scan));
6168
6169                 /* recurse study_chunk() for lookahead body */
6170                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6171                                       last, &data_fake, stopparen,
6172                                       recursed_depth, NULL, f, depth+1,
6173                                       mutate_ok);
6174                 if (scan->flags) {
6175                     if (   deltanext < 0
6176                         || deltanext > (I32) U8_MAX
6177                         || minnext > (I32)U8_MAX
6178                         || minnext + deltanext > (I32)U8_MAX)
6179                     {
6180                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6181                               (UV)U8_MAX);
6182                     }
6183
6184                     /* The 'next_off' field has been repurposed to count the
6185                      * additional starting positions to try beyond the initial
6186                      * one.  (This leaves it at 0 for non-variable length
6187                      * matches to avoid breakage for those not using this
6188                      * extension) */
6189                     if (deltanext) {
6190                         scan->next_off = deltanext;
6191                         ckWARNexperimental(RExC_parse,
6192                             WARN_EXPERIMENTAL__VLB,
6193                             "Variable length lookbehind is experimental");
6194                     }
6195                     scan->flags = (U8)minnext + deltanext;
6196                 }
6197                 if (data) {
6198                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6199                         pars++;
6200                     if (data_fake.flags & SF_HAS_EVAL)
6201                         data->flags |= SF_HAS_EVAL;
6202                     data->whilem_c = data_fake.whilem_c;
6203                 }
6204                 if (f & SCF_DO_STCLASS_AND) {
6205                     if (flags & SCF_DO_STCLASS_OR) {
6206                         /* OR before, AND after: ideally we would recurse with
6207                          * data_fake to get the AND applied by study of the
6208                          * remainder of the pattern, and then derecurse;
6209                          * *** HACK *** for now just treat as "no information".
6210                          * See [perl #56690].
6211                          */
6212                         ssc_init(pRExC_state, data->start_class);
6213                     }  else {
6214                         /* AND before and after: combine and continue.  These
6215                          * assertions are zero-length, so can match an EMPTY
6216                          * string */
6217                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6218                         ANYOF_FLAGS(data->start_class)
6219                                                    |= SSC_MATCHES_EMPTY_STRING;
6220                     }
6221                 }
6222             }
6223 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6224             else {
6225                 /* Positive Lookahead/lookbehind
6226                    In this case we can do fixed string optimisation,
6227                    but we must be careful about it. Note in the case of
6228                    lookbehind the positions will be offset by the minimum
6229                    length of the pattern, something we won't know about
6230                    until after the recurse.
6231                 */
6232                 SSize_t deltanext, fake = 0;
6233                 regnode *nscan;
6234                 regnode_ssc intrnl;
6235                 int f = 0;
6236                 /* We use SAVEFREEPV so that when the full compile
6237                     is finished perl will clean up the allocated
6238                     minlens when it's all done. This way we don't
6239                     have to worry about freeing them when we know
6240                     they wont be used, which would be a pain.
6241                  */
6242                 SSize_t *minnextp;
6243                 Newx( minnextp, 1, SSize_t );
6244                 SAVEFREEPV(minnextp);
6245
6246                 if (data) {
6247                     StructCopy(data, &data_fake, scan_data_t);
6248                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6249                         f |= SCF_DO_SUBSTR;
6250                         if (scan->flags)
6251                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6252                         data_fake.last_found=newSVsv(data->last_found);
6253                     }
6254                 }
6255                 else
6256                     data_fake.last_closep = &fake;
6257                 data_fake.flags = 0;
6258                 data_fake.substrs[0].flags = 0;
6259                 data_fake.substrs[1].flags = 0;
6260                 data_fake.pos_delta = delta;
6261                 if (is_inf)
6262                     data_fake.flags |= SF_IS_INF;
6263                 if ( flags & SCF_DO_STCLASS && !scan->flags
6264                      && OP(scan) == IFMATCH ) { /* Lookahead */
6265                     ssc_init(pRExC_state, &intrnl);
6266                     data_fake.start_class = &intrnl;
6267                     f |= SCF_DO_STCLASS_AND;
6268                 }
6269                 if (flags & SCF_WHILEM_VISITED_POS)
6270                     f |= SCF_WHILEM_VISITED_POS;
6271                 next = regnext(scan);
6272                 nscan = NEXTOPER(NEXTOPER(scan));
6273
6274                 /* positive lookahead study_chunk() recursion */
6275                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6276                                         &deltanext, last, &data_fake,
6277                                         stopparen, recursed_depth, NULL,
6278                                         f, depth+1, mutate_ok);
6279                 if (scan->flags) {
6280                     assert(0);  /* This code has never been tested since this
6281                                    is normally not compiled */
6282                     if (   deltanext < 0
6283                         || deltanext > (I32) U8_MAX
6284                         || *minnextp > (I32)U8_MAX
6285                         || *minnextp + deltanext > (I32)U8_MAX)
6286                     {
6287                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6288                               (UV)U8_MAX);
6289                     }
6290
6291                     if (deltanext) {
6292                         scan->next_off = deltanext;
6293                     }
6294                     scan->flags = (U8)*minnextp + deltanext;
6295                 }
6296
6297                 *minnextp += min;
6298
6299                 if (f & SCF_DO_STCLASS_AND) {
6300                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6301                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6302                 }
6303                 if (data) {
6304                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6305                         pars++;
6306                     if (data_fake.flags & SF_HAS_EVAL)
6307                         data->flags |= SF_HAS_EVAL;
6308                     data->whilem_c = data_fake.whilem_c;
6309                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6310                         int i;
6311                         if (RExC_rx->minlen<*minnextp)
6312                             RExC_rx->minlen=*minnextp;
6313                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6314                         SvREFCNT_dec_NN(data_fake.last_found);
6315
6316                         for (i = 0; i < 2; i++) {
6317                             if (data_fake.substrs[i].minlenp != minlenp) {
6318                                 data->substrs[i].min_offset =
6319                                             data_fake.substrs[i].min_offset;
6320                                 data->substrs[i].max_offset =
6321                                             data_fake.substrs[i].max_offset;
6322                                 data->substrs[i].minlenp =
6323                                             data_fake.substrs[i].minlenp;
6324                                 data->substrs[i].lookbehind += scan->flags;
6325                             }
6326                         }
6327                     }
6328                 }
6329             }
6330 #endif
6331         }
6332         else if (OP(scan) == OPEN) {
6333             if (stopparen != (I32)ARG(scan))
6334                 pars++;
6335         }
6336         else if (OP(scan) == CLOSE) {
6337             if (stopparen == (I32)ARG(scan)) {
6338                 break;
6339             }
6340             if ((I32)ARG(scan) == is_par) {
6341                 next = regnext(scan);
6342
6343                 if ( next && (OP(next) != WHILEM) && next < last)
6344                     is_par = 0;         /* Disable optimization */
6345             }
6346             if (data)
6347                 *(data->last_closep) = ARG(scan);
6348         }
6349         else if (OP(scan) == EVAL) {
6350                 if (data)
6351                     data->flags |= SF_HAS_EVAL;
6352         }
6353         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6354             if (flags & SCF_DO_SUBSTR) {
6355                 scan_commit(pRExC_state, data, minlenp, is_inf);
6356                 flags &= ~SCF_DO_SUBSTR;
6357             }
6358             if (data && OP(scan)==ACCEPT) {
6359                 data->flags |= SCF_SEEN_ACCEPT;
6360                 if (stopmin > min)
6361                     stopmin = min;
6362             }
6363         }
6364         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6365         {
6366                 if (flags & SCF_DO_SUBSTR) {
6367                     scan_commit(pRExC_state, data, minlenp, is_inf);
6368                     data->cur_is_floating = 1; /* float */
6369                 }
6370                 is_inf = is_inf_internal = 1;
6371                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6372                     ssc_anything(data->start_class);
6373                 flags &= ~SCF_DO_STCLASS;
6374         }
6375         else if (OP(scan) == GPOS) {
6376             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6377                 !(delta || is_inf || (data && data->pos_delta)))
6378             {
6379                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6380                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6381                 if (RExC_rx->gofs < (STRLEN)min)
6382                     RExC_rx->gofs = min;
6383             } else {
6384                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6385                 RExC_rx->gofs = 0;
6386             }
6387         }
6388 #ifdef TRIE_STUDY_OPT
6389 #ifdef FULL_TRIE_STUDY
6390         else if (PL_regkind[OP(scan)] == TRIE) {
6391             /* NOTE - There is similar code to this block above for handling
6392                BRANCH nodes on the initial study.  If you change stuff here
6393                check there too. */
6394             regnode *trie_node= scan;
6395             regnode *tail= regnext(scan);
6396             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6397             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6398             regnode_ssc accum;
6399
6400             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6401                 /* Cannot merge strings after this. */
6402                 scan_commit(pRExC_state, data, minlenp, is_inf);
6403             }
6404             if (flags & SCF_DO_STCLASS)
6405                 ssc_init_zero(pRExC_state, &accum);
6406
6407             if (!trie->jump) {
6408                 min1= trie->minlen;
6409                 max1= trie->maxlen;
6410             } else {
6411                 const regnode *nextbranch= NULL;
6412                 U32 word;
6413
6414                 for ( word=1 ; word <= trie->wordcount ; word++)
6415                 {
6416                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6417                     regnode_ssc this_class;
6418
6419                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6420                     if (data) {
6421                         data_fake.whilem_c = data->whilem_c;
6422                         data_fake.last_closep = data->last_closep;
6423                     }
6424                     else
6425                         data_fake.last_closep = &fake;
6426                     data_fake.pos_delta = delta;
6427                     if (flags & SCF_DO_STCLASS) {
6428                         ssc_init(pRExC_state, &this_class);
6429                         data_fake.start_class = &this_class;
6430                         f = SCF_DO_STCLASS_AND;
6431                     }
6432                     if (flags & SCF_WHILEM_VISITED_POS)
6433                         f |= SCF_WHILEM_VISITED_POS;
6434
6435                     if (trie->jump[word]) {
6436                         if (!nextbranch)
6437                             nextbranch = trie_node + trie->jump[0];
6438                         scan= trie_node + trie->jump[word];
6439                         /* We go from the jump point to the branch that follows
6440                            it. Note this means we need the vestigal unused
6441                            branches even though they arent otherwise used. */
6442                         /* optimise study_chunk() for TRIE */
6443                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6444                             &deltanext, (regnode *)nextbranch, &data_fake,
6445                             stopparen, recursed_depth, NULL, f, depth+1,
6446                             mutate_ok);
6447                     }
6448                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6449                         nextbranch= regnext((regnode*)nextbranch);
6450
6451                     if (min1 > (SSize_t)(minnext + trie->minlen))
6452                         min1 = minnext + trie->minlen;
6453                     if (deltanext == OPTIMIZE_INFTY) {
6454                         is_inf = is_inf_internal = 1;
6455                         max1 = OPTIMIZE_INFTY;
6456                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6457                         max1 = minnext + deltanext + trie->maxlen;
6458
6459                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6460                         pars++;
6461                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6462                         if ( stopmin > min + min1)
6463                             stopmin = min + min1;
6464                         flags &= ~SCF_DO_SUBSTR;
6465                         if (data)
6466                             data->flags |= SCF_SEEN_ACCEPT;
6467                     }
6468                     if (data) {
6469                         if (data_fake.flags & SF_HAS_EVAL)
6470                             data->flags |= SF_HAS_EVAL;
6471                         data->whilem_c = data_fake.whilem_c;
6472                     }
6473                     if (flags & SCF_DO_STCLASS)
6474                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6475                 }
6476             }
6477             if (flags & SCF_DO_SUBSTR) {
6478                 data->pos_min += min1;
6479                 data->pos_delta += max1 - min1;
6480                 if (max1 != min1 || is_inf)
6481                     data->cur_is_floating = 1; /* float */
6482             }
6483             min += min1;
6484             if (delta != OPTIMIZE_INFTY) {
6485                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6486                     delta += max1 - min1;
6487                 else
6488                     delta = OPTIMIZE_INFTY;
6489             }
6490             if (flags & SCF_DO_STCLASS_OR) {
6491                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6492                 if (min1) {
6493                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6494                     flags &= ~SCF_DO_STCLASS;
6495                 }
6496             }
6497             else if (flags & SCF_DO_STCLASS_AND) {
6498                 if (min1) {
6499                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6500                     flags &= ~SCF_DO_STCLASS;
6501                 }
6502                 else {
6503                     /* Switch to OR mode: cache the old value of
6504                      * data->start_class */
6505                     INIT_AND_WITHP;
6506                     StructCopy(data->start_class, and_withp, regnode_ssc);
6507                     flags &= ~SCF_DO_STCLASS_AND;
6508                     StructCopy(&accum, data->start_class, regnode_ssc);
6509                     flags |= SCF_DO_STCLASS_OR;
6510                 }
6511             }
6512             scan= tail;
6513             continue;
6514         }
6515 #else
6516         else if (PL_regkind[OP(scan)] == TRIE) {
6517             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6518             U8*bang=NULL;
6519
6520             min += trie->minlen;
6521             delta += (trie->maxlen - trie->minlen);
6522             flags &= ~SCF_DO_STCLASS; /* xxx */
6523             if (flags & SCF_DO_SUBSTR) {
6524                 /* Cannot expect anything... */
6525                 scan_commit(pRExC_state, data, minlenp, is_inf);
6526                 data->pos_min += trie->minlen;
6527                 data->pos_delta += (trie->maxlen - trie->minlen);
6528                 if (trie->maxlen != trie->minlen)
6529                     data->cur_is_floating = 1; /* float */
6530             }
6531             if (trie->jump) /* no more substrings -- for now /grr*/
6532                flags &= ~SCF_DO_SUBSTR;
6533         }
6534         else if (OP(scan) == REGEX_SET) {
6535             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6536                              " before optimization", reg_name[REGEX_SET]);
6537         }
6538
6539 #endif /* old or new */
6540 #endif /* TRIE_STUDY_OPT */
6541
6542         /* Else: zero-length, ignore. */
6543         scan = regnext(scan);
6544     }
6545
6546   finish:
6547     if (frame) {
6548         /* we need to unwind recursion. */
6549         depth = depth - 1;
6550
6551         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6552         DEBUG_PEEP("fend", scan, depth, flags);
6553
6554         /* restore previous context */
6555         last = frame->last_regnode;
6556         scan = frame->next_regnode;
6557         stopparen = frame->stopparen;
6558         recursed_depth = frame->prev_recursed_depth;
6559
6560         RExC_frame_last = frame->prev_frame;
6561         frame = frame->this_prev_frame;
6562         goto fake_study_recurse;
6563     }
6564
6565     assert(!frame);
6566     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6567
6568     *scanp = scan;
6569     *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6570
6571     if (flags & SCF_DO_SUBSTR && is_inf)
6572         data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6573     if (is_par > (I32)U8_MAX)
6574         is_par = 0;
6575     if (is_par && pars==1 && data) {
6576         data->flags |= SF_IN_PAR;
6577         data->flags &= ~SF_HAS_PAR;
6578     }
6579     else if (pars && data) {
6580         data->flags |= SF_HAS_PAR;
6581         data->flags &= ~SF_IN_PAR;
6582     }
6583     if (flags & SCF_DO_STCLASS_OR)
6584         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6585     if (flags & SCF_TRIE_RESTUDY)
6586         data->flags |=  SCF_TRIE_RESTUDY;
6587
6588     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6589
6590     final_minlen = min < stopmin
6591             ? min : stopmin;
6592
6593     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6594         if (final_minlen > OPTIMIZE_INFTY - delta)
6595             RExC_maxlen = OPTIMIZE_INFTY;
6596         else if (RExC_maxlen < final_minlen + delta)
6597             RExC_maxlen = final_minlen + delta;
6598     }
6599     return final_minlen;
6600 }
6601
6602 STATIC U32
6603 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6604 {
6605     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6606
6607     PERL_ARGS_ASSERT_ADD_DATA;
6608
6609     Renewc(RExC_rxi->data,
6610            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6611            char, struct reg_data);
6612     if(count)
6613         Renew(RExC_rxi->data->what, count + n, U8);
6614     else
6615         Newx(RExC_rxi->data->what, n, U8);
6616     RExC_rxi->data->count = count + n;
6617     Copy(s, RExC_rxi->data->what + count, n, U8);
6618     return count;
6619 }
6620
6621 /*XXX: todo make this not included in a non debugging perl, but appears to be
6622  * used anyway there, in 'use re' */
6623 #ifndef PERL_IN_XSUB_RE
6624 void
6625 Perl_reginitcolors(pTHX)
6626 {
6627     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6628     if (s) {
6629         char *t = savepv(s);
6630         int i = 0;
6631         PL_colors[0] = t;
6632         while (++i < 6) {
6633             t = strchr(t, '\t');
6634             if (t) {
6635                 *t = '\0';
6636                 PL_colors[i] = ++t;
6637             }
6638             else
6639                 PL_colors[i] = t = (char *)"";
6640         }
6641     } else {
6642         int i = 0;
6643         while (i < 6)
6644             PL_colors[i++] = (char *)"";
6645     }
6646     PL_colorset = 1;
6647 }
6648 #endif
6649
6650
6651 #ifdef TRIE_STUDY_OPT
6652 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6653     STMT_START {                                            \
6654         if (                                                \
6655               (data.flags & SCF_TRIE_RESTUDY)               \
6656               && ! restudied++                              \
6657         ) {                                                 \
6658             dOsomething;                                    \
6659             goto reStudy;                                   \
6660         }                                                   \
6661     } STMT_END
6662 #else
6663 #define CHECK_RESTUDY_GOTO_butfirst
6664 #endif
6665
6666 /*
6667  * pregcomp - compile a regular expression into internal code
6668  *
6669  * Decides which engine's compiler to call based on the hint currently in
6670  * scope
6671  */
6672
6673 #ifndef PERL_IN_XSUB_RE
6674
6675 /* return the currently in-scope regex engine (or the default if none)  */
6676
6677 regexp_engine const *
6678 Perl_current_re_engine(pTHX)
6679 {
6680     if (IN_PERL_COMPILETIME) {
6681         HV * const table = GvHV(PL_hintgv);
6682         SV **ptr;
6683
6684         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6685             return &PL_core_reg_engine;
6686         ptr = hv_fetchs(table, "regcomp", FALSE);
6687         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6688             return &PL_core_reg_engine;
6689         return INT2PTR(regexp_engine*, SvIV(*ptr));
6690     }
6691     else {
6692         SV *ptr;
6693         if (!PL_curcop->cop_hints_hash)
6694             return &PL_core_reg_engine;
6695         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6696         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6697             return &PL_core_reg_engine;
6698         return INT2PTR(regexp_engine*, SvIV(ptr));
6699     }
6700 }
6701
6702
6703 REGEXP *
6704 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6705 {
6706     regexp_engine const *eng = current_re_engine();
6707     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6708
6709     PERL_ARGS_ASSERT_PREGCOMP;
6710
6711     /* Dispatch a request to compile a regexp to correct regexp engine. */
6712     DEBUG_COMPILE_r({
6713         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6714                         PTR2UV(eng));
6715     });
6716     return CALLREGCOMP_ENG(eng, pattern, flags);
6717 }
6718 #endif
6719
6720 /* public(ish) entry point for the perl core's own regex compiling code.
6721  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6722  * pattern rather than a list of OPs, and uses the internal engine rather
6723  * than the current one */
6724
6725 REGEXP *
6726 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6727 {
6728     SV *pat = pattern; /* defeat constness! */
6729
6730     PERL_ARGS_ASSERT_RE_COMPILE;
6731
6732     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6733 #ifdef PERL_IN_XSUB_RE
6734                                 &my_reg_engine,
6735 #else
6736                                 &PL_core_reg_engine,
6737 #endif
6738                                 NULL, NULL, rx_flags, 0);
6739 }
6740
6741 static void
6742 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6743 {
6744     int n;
6745
6746     if (--cbs->refcnt > 0)
6747         return;
6748     for (n = 0; n < cbs->count; n++) {
6749         REGEXP *rx = cbs->cb[n].src_regex;
6750         if (rx) {
6751             cbs->cb[n].src_regex = NULL;
6752             SvREFCNT_dec_NN(rx);
6753         }
6754     }
6755     Safefree(cbs->cb);
6756     Safefree(cbs);
6757 }
6758
6759
6760 static struct reg_code_blocks *
6761 S_alloc_code_blocks(pTHX_  int ncode)
6762 {
6763      struct reg_code_blocks *cbs;
6764     Newx(cbs, 1, struct reg_code_blocks);
6765     cbs->count = ncode;
6766     cbs->refcnt = 1;
6767     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6768     if (ncode)
6769         Newx(cbs->cb, ncode, struct reg_code_block);
6770     else
6771         cbs->cb = NULL;
6772     return cbs;
6773 }
6774
6775
6776 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6777  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6778  * point to the realloced string and length.
6779  *
6780  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6781  * stuff added */
6782
6783 static void
6784 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6785                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6786 {
6787     U8 *const src = (U8*)*pat_p;
6788     U8 *dst, *d;
6789     int n=0;
6790     STRLEN s = 0;
6791     bool do_end = 0;
6792     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6793
6794     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6795         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6796
6797     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6798     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6799     d = dst;
6800
6801     while (s < *plen_p) {
6802         append_utf8_from_native_byte(src[s], &d);
6803
6804         if (n < num_code_blocks) {
6805             assert(pRExC_state->code_blocks);
6806             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6807                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6808                 assert(*(d - 1) == '(');
6809                 do_end = 1;
6810             }
6811             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6812                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6813                 assert(*(d - 1) == ')');
6814                 do_end = 0;
6815                 n++;
6816             }
6817         }
6818         s++;
6819     }
6820     *d = '\0';
6821     *plen_p = d - dst;
6822     *pat_p = (char*) dst;
6823     SAVEFREEPV(*pat_p);
6824     RExC_orig_utf8 = RExC_utf8 = 1;
6825 }
6826
6827
6828
6829 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6830  * while recording any code block indices, and handling overloading,
6831  * nested qr// objects etc.  If pat is null, it will allocate a new
6832  * string, or just return the first arg, if there's only one.
6833  *
6834  * Returns the malloced/updated pat.
6835  * patternp and pat_count is the array of SVs to be concatted;
6836  * oplist is the optional list of ops that generated the SVs;
6837  * recompile_p is a pointer to a boolean that will be set if
6838  *   the regex will need to be recompiled.
6839  * delim, if non-null is an SV that will be inserted between each element
6840  */
6841
6842 static SV*
6843 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6844                 SV *pat, SV ** const patternp, int pat_count,
6845                 OP *oplist, bool *recompile_p, SV *delim)
6846 {
6847     SV **svp;
6848     int n = 0;
6849     bool use_delim = FALSE;
6850     bool alloced = FALSE;
6851
6852     /* if we know we have at least two args, create an empty string,
6853      * then concatenate args to that. For no args, return an empty string */
6854     if (!pat && pat_count != 1) {
6855         pat = newSVpvs("");
6856         SAVEFREESV(pat);
6857         alloced = TRUE;
6858     }
6859
6860     for (svp = patternp; svp < patternp + pat_count; svp++) {
6861         SV *sv;
6862         SV *rx  = NULL;
6863         STRLEN orig_patlen = 0;
6864         bool code = 0;
6865         SV *msv = use_delim ? delim : *svp;
6866         if (!msv) msv = &PL_sv_undef;
6867
6868         /* if we've got a delimiter, we go round the loop twice for each
6869          * svp slot (except the last), using the delimiter the second
6870          * time round */
6871         if (use_delim) {
6872             svp--;
6873             use_delim = FALSE;
6874         }
6875         else if (delim)
6876             use_delim = TRUE;
6877
6878         if (SvTYPE(msv) == SVt_PVAV) {
6879             /* we've encountered an interpolated array within
6880              * the pattern, e.g. /...@a..../. Expand the list of elements,
6881              * then recursively append elements.
6882              * The code in this block is based on S_pushav() */
6883
6884             AV *const av = (AV*)msv;
6885             const SSize_t maxarg = AvFILL(av) + 1;
6886             SV **array;
6887
6888             if (oplist) {
6889                 assert(oplist->op_type == OP_PADAV
6890                     || oplist->op_type == OP_RV2AV);
6891                 oplist = OpSIBLING(oplist);
6892             }
6893
6894             if (SvRMAGICAL(av)) {
6895                 SSize_t i;
6896
6897                 Newx(array, maxarg, SV*);
6898                 SAVEFREEPV(array);
6899                 for (i=0; i < maxarg; i++) {
6900                     SV ** const svp = av_fetch(av, i, FALSE);
6901                     array[i] = svp ? *svp : &PL_sv_undef;
6902                 }
6903             }
6904             else
6905                 array = AvARRAY(av);
6906
6907             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6908                                 array, maxarg, NULL, recompile_p,
6909                                 /* $" */
6910                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6911
6912             continue;
6913         }
6914
6915
6916         /* we make the assumption here that each op in the list of
6917          * op_siblings maps to one SV pushed onto the stack,
6918          * except for code blocks, with have both an OP_NULL and
6919          * an OP_CONST.
6920          * This allows us to match up the list of SVs against the
6921          * list of OPs to find the next code block.
6922          *
6923          * Note that       PUSHMARK PADSV PADSV ..
6924          * is optimised to
6925          *                 PADRANGE PADSV  PADSV  ..
6926          * so the alignment still works. */
6927
6928         if (oplist) {
6929             if (oplist->op_type == OP_NULL
6930                 && (oplist->op_flags & OPf_SPECIAL))
6931             {
6932                 assert(n < pRExC_state->code_blocks->count);
6933                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6934                 pRExC_state->code_blocks->cb[n].block = oplist;
6935                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6936                 n++;
6937                 code = 1;
6938                 oplist = OpSIBLING(oplist); /* skip CONST */
6939                 assert(oplist);
6940             }
6941             oplist = OpSIBLING(oplist);;
6942         }
6943
6944         /* apply magic and QR overloading to arg */
6945
6946         SvGETMAGIC(msv);
6947         if (SvROK(msv) && SvAMAGIC(msv)) {
6948             SV *sv = AMG_CALLunary(msv, regexp_amg);
6949             if (sv) {
6950                 if (SvROK(sv))
6951                     sv = SvRV(sv);
6952                 if (SvTYPE(sv) != SVt_REGEXP)
6953                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6954                 msv = sv;
6955             }
6956         }
6957
6958         /* try concatenation overload ... */
6959         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6960                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6961         {
6962             sv_setsv(pat, sv);
6963             /* overloading involved: all bets are off over literal
6964              * code. Pretend we haven't seen it */
6965             if (n)
6966                 pRExC_state->code_blocks->count -= n;
6967             n = 0;
6968         }
6969         else {
6970             /* ... or failing that, try "" overload */
6971             while (SvAMAGIC(msv)
6972                     && (sv = AMG_CALLunary(msv, string_amg))
6973                     && sv != msv
6974                     &&  !(   SvROK(msv)
6975                           && SvROK(sv)
6976                           && SvRV(msv) == SvRV(sv))
6977             ) {
6978                 msv = sv;
6979                 SvGETMAGIC(msv);
6980             }
6981             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6982                 msv = SvRV(msv);
6983
6984             if (pat) {
6985                 /* this is a partially unrolled
6986                  *     sv_catsv_nomg(pat, msv);
6987                  * that allows us to adjust code block indices if
6988                  * needed */
6989                 STRLEN dlen;
6990                 char *dst = SvPV_force_nomg(pat, dlen);
6991                 orig_patlen = dlen;
6992                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6993                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6994                     sv_setpvn(pat, dst, dlen);
6995                     SvUTF8_on(pat);
6996                 }
6997                 sv_catsv_nomg(pat, msv);
6998                 rx = msv;
6999             }
7000             else {
7001                 /* We have only one SV to process, but we need to verify
7002                  * it is properly null terminated or we will fail asserts
7003                  * later. In theory we probably shouldn't get such SV's,
7004                  * but if we do we should handle it gracefully. */
7005                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7006                     /* not a string, or a string with a trailing null */
7007                     pat = msv;
7008                 } else {
7009                     /* a string with no trailing null, we need to copy it
7010                      * so it has a trailing null */
7011                     pat = sv_2mortal(newSVsv(msv));
7012                 }
7013             }
7014
7015             if (code)
7016                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7017         }
7018
7019         /* extract any code blocks within any embedded qr//'s */
7020         if (rx && SvTYPE(rx) == SVt_REGEXP
7021             && RX_ENGINE((REGEXP*)rx)->op_comp)
7022         {
7023
7024             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7025             if (ri->code_blocks && ri->code_blocks->count) {
7026                 int i;
7027                 /* the presence of an embedded qr// with code means
7028                  * we should always recompile: the text of the
7029                  * qr// may not have changed, but it may be a
7030                  * different closure than last time */
7031                 *recompile_p = 1;
7032                 if (pRExC_state->code_blocks) {
7033                     int new_count = pRExC_state->code_blocks->count
7034                             + ri->code_blocks->count;
7035                     Renew(pRExC_state->code_blocks->cb,
7036                             new_count, struct reg_code_block);
7037                     pRExC_state->code_blocks->count = new_count;
7038                 }
7039                 else
7040                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7041                                                     ri->code_blocks->count);
7042
7043                 for (i=0; i < ri->code_blocks->count; i++) {
7044                     struct reg_code_block *src, *dst;
7045                     STRLEN offset =  orig_patlen
7046                         + ReANY((REGEXP *)rx)->pre_prefix;
7047                     assert(n < pRExC_state->code_blocks->count);
7048                     src = &ri->code_blocks->cb[i];
7049                     dst = &pRExC_state->code_blocks->cb[n];
7050                     dst->start      = src->start + offset;
7051                     dst->end        = src->end   + offset;
7052                     dst->block      = src->block;
7053                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
7054                                             src->src_regex
7055                                                 ? src->src_regex
7056                                                 : (REGEXP*)rx);
7057                     n++;
7058                 }
7059             }
7060         }
7061     }
7062     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7063     if (alloced)
7064         SvSETMAGIC(pat);
7065
7066     return pat;
7067 }
7068
7069
7070
7071 /* see if there are any run-time code blocks in the pattern.
7072  * False positives are allowed */
7073
7074 static bool
7075 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7076                     char *pat, STRLEN plen)
7077 {
7078     int n = 0;
7079     STRLEN s;
7080
7081     PERL_UNUSED_CONTEXT;
7082
7083     for (s = 0; s < plen; s++) {
7084         if (   pRExC_state->code_blocks
7085             && n < pRExC_state->code_blocks->count
7086             && s == pRExC_state->code_blocks->cb[n].start)
7087         {
7088             s = pRExC_state->code_blocks->cb[n].end;
7089             n++;
7090             continue;
7091         }
7092         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7093          * positives here */
7094         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7095             (pat[s+2] == '{'
7096                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7097         )
7098             return 1;
7099     }
7100     return 0;
7101 }
7102
7103 /* Handle run-time code blocks. We will already have compiled any direct
7104  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7105  * copy of it, but with any literal code blocks blanked out and
7106  * appropriate chars escaped; then feed it into
7107  *
7108  *    eval "qr'modified_pattern'"
7109  *
7110  * For example,
7111  *
7112  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7113  *
7114  * becomes
7115  *
7116  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7117  *
7118  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7119  * and merge them with any code blocks of the original regexp.
7120  *
7121  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7122  * instead, just save the qr and return FALSE; this tells our caller that
7123  * the original pattern needs upgrading to utf8.
7124  */
7125
7126 static bool
7127 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7128     char *pat, STRLEN plen)
7129 {
7130     SV *qr;
7131
7132     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7133
7134     if (pRExC_state->runtime_code_qr) {
7135         /* this is the second time we've been called; this should
7136          * only happen if the main pattern got upgraded to utf8
7137          * during compilation; re-use the qr we compiled first time
7138          * round (which should be utf8 too)
7139          */
7140         qr = pRExC_state->runtime_code_qr;
7141         pRExC_state->runtime_code_qr = NULL;
7142         assert(RExC_utf8 && SvUTF8(qr));
7143     }
7144     else {
7145         int n = 0;
7146         STRLEN s;
7147         char *p, *newpat;
7148         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7149         SV *sv, *qr_ref;
7150         dSP;
7151
7152         /* determine how many extra chars we need for ' and \ escaping */
7153         for (s = 0; s < plen; s++) {
7154             if (pat[s] == '\'' || pat[s] == '\\')
7155                 newlen++;
7156         }
7157
7158         Newx(newpat, newlen, char);
7159         p = newpat;
7160         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7161
7162         for (s = 0; s < plen; s++) {
7163             if (   pRExC_state->code_blocks
7164                 && n < pRExC_state->code_blocks->count
7165                 && s == pRExC_state->code_blocks->cb[n].start)
7166             {
7167                 /* blank out literal code block so that they aren't
7168                  * recompiled: eg change from/to:
7169                  *     /(?{xyz})/
7170                  *     /(?=====)/
7171                  * and
7172                  *     /(??{xyz})/
7173                  *     /(?======)/
7174                  * and
7175                  *     /(?(?{xyz}))/
7176                  *     /(?(?=====))/
7177                 */
7178                 assert(pat[s]   == '(');
7179                 assert(pat[s+1] == '?');
7180                 *p++ = '(';
7181                 *p++ = '?';
7182                 s += 2;
7183                 while (s < pRExC_state->code_blocks->cb[n].end) {
7184                     *p++ = '=';
7185                     s++;
7186                 }
7187                 *p++ = ')';
7188                 n++;
7189                 continue;
7190             }
7191             if (pat[s] == '\'' || pat[s] == '\\')
7192                 *p++ = '\\';
7193             *p++ = pat[s];
7194         }
7195         *p++ = '\'';
7196         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7197             *p++ = 'x';
7198             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7199                 *p++ = 'x';
7200             }
7201         }
7202         *p++ = '\0';
7203         DEBUG_COMPILE_r({
7204             Perl_re_printf( aTHX_
7205                 "%sre-parsing pattern for runtime code:%s %s\n",
7206                 PL_colors[4], PL_colors[5], newpat);
7207         });
7208
7209         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7210         Safefree(newpat);
7211
7212         ENTER;
7213         SAVETMPS;
7214         save_re_context();
7215         PUSHSTACKi(PERLSI_REQUIRE);
7216         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7217          * parsing qr''; normally only q'' does this. It also alters
7218          * hints handling */
7219         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7220         SvREFCNT_dec_NN(sv);
7221         SPAGAIN;
7222         qr_ref = POPs;
7223         PUTBACK;
7224         {
7225             SV * const errsv = ERRSV;
7226             if (SvTRUE_NN(errsv))
7227                 /* use croak_sv ? */
7228                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7229         }
7230         assert(SvROK(qr_ref));
7231         qr = SvRV(qr_ref);
7232         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7233         /* the leaving below frees the tmp qr_ref.
7234          * Give qr a life of its own */
7235         SvREFCNT_inc(qr);
7236         POPSTACK;
7237         FREETMPS;
7238         LEAVE;
7239
7240     }
7241
7242     if (!RExC_utf8 && SvUTF8(qr)) {
7243         /* first time through; the pattern got upgraded; save the
7244          * qr for the next time through */
7245         assert(!pRExC_state->runtime_code_qr);
7246         pRExC_state->runtime_code_qr = qr;
7247         return 0;
7248     }
7249
7250
7251     /* extract any code blocks within the returned qr//  */
7252
7253
7254     /* merge the main (r1) and run-time (r2) code blocks into one */
7255     {
7256         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7257         struct reg_code_block *new_block, *dst;
7258         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7259         int i1 = 0, i2 = 0;
7260         int r1c, r2c;
7261
7262         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7263         {
7264             SvREFCNT_dec_NN(qr);
7265             return 1;
7266         }
7267
7268         if (!r1->code_blocks)
7269             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7270
7271         r1c = r1->code_blocks->count;
7272         r2c = r2->code_blocks->count;
7273
7274         Newx(new_block, r1c + r2c, struct reg_code_block);
7275
7276         dst = new_block;
7277
7278         while (i1 < r1c || i2 < r2c) {
7279             struct reg_code_block *src;
7280             bool is_qr = 0;
7281
7282             if (i1 == r1c) {
7283                 src = &r2->code_blocks->cb[i2++];
7284                 is_qr = 1;
7285             }
7286             else if (i2 == r2c)
7287                 src = &r1->code_blocks->cb[i1++];
7288             else if (  r1->code_blocks->cb[i1].start
7289                      < r2->code_blocks->cb[i2].start)
7290             {
7291                 src = &r1->code_blocks->cb[i1++];
7292                 assert(src->end < r2->code_blocks->cb[i2].start);
7293             }
7294             else {
7295                 assert(  r1->code_blocks->cb[i1].start
7296                        > r2->code_blocks->cb[i2].start);
7297                 src = &r2->code_blocks->cb[i2++];
7298                 is_qr = 1;
7299                 assert(src->end < r1->code_blocks->cb[i1].start);
7300             }
7301
7302             assert(pat[src->start] == '(');
7303             assert(pat[src->end]   == ')');
7304             dst->start      = src->start;
7305             dst->end        = src->end;
7306             dst->block      = src->block;
7307             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7308                                     : src->src_regex;
7309             dst++;
7310         }
7311         r1->code_blocks->count += r2c;
7312         Safefree(r1->code_blocks->cb);
7313         r1->code_blocks->cb = new_block;
7314     }
7315
7316     SvREFCNT_dec_NN(qr);
7317     return 1;
7318 }
7319
7320
7321 STATIC bool
7322 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7323                       struct reg_substr_datum  *rsd,
7324                       struct scan_data_substrs *sub,
7325                       STRLEN longest_length)
7326 {
7327     /* This is the common code for setting up the floating and fixed length
7328      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7329      * as to whether succeeded or not */
7330
7331     I32 t;
7332     SSize_t ml;
7333     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7334     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7335
7336     if (! (longest_length
7337            || (eol /* Can't have SEOL and MULTI */
7338                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7339           )
7340             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7341         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7342     {
7343         return FALSE;
7344     }
7345
7346     /* copy the information about the longest from the reg_scan_data
7347         over to the program. */
7348     if (SvUTF8(sub->str)) {
7349         rsd->substr      = NULL;
7350         rsd->utf8_substr = sub->str;
7351     } else {
7352         rsd->substr      = sub->str;
7353         rsd->utf8_substr = NULL;
7354     }
7355     /* end_shift is how many chars that must be matched that
7356         follow this item. We calculate it ahead of time as once the
7357         lookbehind offset is added in we lose the ability to correctly
7358         calculate it.*/
7359     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7360     rsd->end_shift = ml - sub->min_offset
7361         - longest_length
7362             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7363              * intead? - DAPM
7364             + (SvTAIL(sub->str) != 0)
7365             */
7366         + sub->lookbehind;
7367
7368     t = (eol/* Can't have SEOL and MULTI */
7369          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7370     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7371
7372     return TRUE;
7373 }
7374
7375 STATIC void
7376 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7377 {
7378     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7379      * properly wrapped with the right modifiers */
7380
7381     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7382     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7383                                                 != REGEX_DEPENDS_CHARSET);
7384
7385     /* The caret is output if there are any defaults: if not all the STD
7386         * flags are set, or if no character set specifier is needed */
7387     bool has_default =
7388                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7389                 || ! has_charset);
7390     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7391                                                 == REG_RUN_ON_COMMENT_SEEN);
7392     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7393                         >> RXf_PMf_STD_PMMOD_SHIFT);
7394     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7395     char *p;
7396     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7397
7398     /* We output all the necessary flags; we never output a minus, as all
7399         * those are defaults, so are
7400         * covered by the caret */
7401     const STRLEN wraplen = pat_len + has_p + has_runon
7402         + has_default       /* If needs a caret */
7403         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7404
7405             /* If needs a character set specifier */
7406         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7407         + (sizeof("(?:)") - 1);
7408
7409     PERL_ARGS_ASSERT_SET_REGEX_PV;
7410
7411     /* make sure PL_bitcount bounds not exceeded */
7412     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7413
7414     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7415     SvPOK_on(Rx);
7416     if (RExC_utf8)
7417         SvFLAGS(Rx) |= SVf_UTF8;
7418     *p++='('; *p++='?';
7419
7420     /* If a default, cover it using the caret */
7421     if (has_default) {
7422         *p++= DEFAULT_PAT_MOD;
7423     }
7424     if (has_charset) {
7425         STRLEN len;
7426         const char* name;
7427
7428         name = get_regex_charset_name(RExC_rx->extflags, &len);
7429         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7430             assert(RExC_utf8);
7431             name = UNICODE_PAT_MODS;
7432             len = sizeof(UNICODE_PAT_MODS) - 1;
7433         }
7434         Copy(name, p, len, char);
7435         p += len;
7436     }
7437     if (has_p)
7438         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7439     {
7440         char ch;
7441         while((ch = *fptr++)) {
7442             if(reganch & 1)
7443                 *p++ = ch;
7444             reganch >>= 1;
7445         }
7446     }
7447
7448     *p++ = ':';
7449     Copy(RExC_precomp, p, pat_len, char);
7450     assert ((RX_WRAPPED(Rx) - p) < 16);
7451     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7452     p += pat_len;
7453
7454     /* Adding a trailing \n causes this to compile properly:
7455             my $R = qr / A B C # D E/x; /($R)/
7456         Otherwise the parens are considered part of the comment */
7457     if (has_runon)
7458         *p++ = '\n';
7459     *p++ = ')';
7460     *p = 0;
7461     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7462 }
7463
7464 /*
7465  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7466  * regular expression into internal code.
7467  * The pattern may be passed either as:
7468  *    a list of SVs (patternp plus pat_count)
7469  *    a list of OPs (expr)
7470  * If both are passed, the SV list is used, but the OP list indicates
7471  * which SVs are actually pre-compiled code blocks
7472  *
7473  * The SVs in the list have magic and qr overloading applied to them (and
7474  * the list may be modified in-place with replacement SVs in the latter
7475  * case).
7476  *
7477  * If the pattern hasn't changed from old_re, then old_re will be
7478  * returned.
7479  *
7480  * eng is the current engine. If that engine has an op_comp method, then
7481  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7482  * do the initial concatenation of arguments and pass on to the external
7483  * engine.
7484  *
7485  * If is_bare_re is not null, set it to a boolean indicating whether the
7486  * arg list reduced (after overloading) to a single bare regex which has
7487  * been returned (i.e. /$qr/).
7488  *
7489  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7490  *
7491  * pm_flags contains the PMf_* flags, typically based on those from the
7492  * pm_flags field of the related PMOP. Currently we're only interested in
7493  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7494  *
7495  * For many years this code had an initial sizing pass that calculated
7496  * (sometimes incorrectly, leading to security holes) the size needed for the
7497  * compiled pattern.  That was changed by commit
7498  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7499  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7500  * references to this sizing pass.
7501  *
7502  * Now, an initial crude guess as to the size needed is made, based on the
7503  * length of the pattern.  Patches welcome to improve that guess.  That amount
7504  * of space is malloc'd and then immediately freed, and then clawed back node
7505  * by node.  This design is to minimze, to the extent possible, memory churn
7506  * when doing the reallocs.
7507  *
7508  * A separate parentheses counting pass may be needed in some cases.
7509  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7510  * of these cases.
7511  *
7512  * The existence of a sizing pass necessitated design decisions that are no
7513  * longer needed.  There are potential areas of simplification.
7514  *
7515  * Beware that the optimization-preparation code in here knows about some
7516  * of the structure of the compiled regexp.  [I'll say.]
7517  */
7518
7519 REGEXP *
7520 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7521                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7522                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7523 {
7524     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7525     STRLEN plen;
7526     char *exp;
7527     regnode *scan;
7528     I32 flags;
7529     SSize_t minlen = 0;
7530     U32 rx_flags;
7531     SV *pat;
7532     SV** new_patternp = patternp;
7533
7534     /* these are all flags - maybe they should be turned
7535      * into a single int with different bit masks */
7536     I32 sawlookahead = 0;
7537     I32 sawplus = 0;
7538     I32 sawopen = 0;
7539     I32 sawminmod = 0;
7540
7541     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7542     bool recompile = 0;
7543     bool runtime_code = 0;
7544     scan_data_t data;
7545     RExC_state_t RExC_state;
7546     RExC_state_t * const pRExC_state = &RExC_state;
7547 #ifdef TRIE_STUDY_OPT
7548     int restudied = 0;
7549     RExC_state_t copyRExC_state;
7550 #endif
7551     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7552
7553     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7554
7555     DEBUG_r(if (!PL_colorset) reginitcolors());
7556
7557
7558     pRExC_state->warn_text = NULL;
7559     pRExC_state->unlexed_names = NULL;
7560     pRExC_state->code_blocks = NULL;
7561
7562     if (is_bare_re)
7563         *is_bare_re = FALSE;
7564
7565     if (expr && (expr->op_type == OP_LIST ||
7566                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7567         /* allocate code_blocks if needed */
7568         OP *o;
7569         int ncode = 0;
7570
7571         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7572             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7573                 ncode++; /* count of DO blocks */
7574
7575         if (ncode)
7576             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7577     }
7578
7579     if (!pat_count) {
7580         /* compile-time pattern with just OP_CONSTs and DO blocks */
7581
7582         int n;
7583         OP *o;
7584
7585         /* find how many CONSTs there are */
7586         assert(expr);
7587         n = 0;
7588         if (expr->op_type == OP_CONST)
7589             n = 1;
7590         else
7591             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7592                 if (o->op_type == OP_CONST)
7593                     n++;
7594             }
7595
7596         /* fake up an SV array */
7597
7598         assert(!new_patternp);
7599         Newx(new_patternp, n, SV*);
7600         SAVEFREEPV(new_patternp);
7601         pat_count = n;
7602
7603         n = 0;
7604         if (expr->op_type == OP_CONST)
7605             new_patternp[n] = cSVOPx_sv(expr);
7606         else
7607             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7608                 if (o->op_type == OP_CONST)
7609                     new_patternp[n++] = cSVOPo_sv;
7610             }
7611
7612     }
7613
7614     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7615         "Assembling pattern from %d elements%s\n", pat_count,
7616             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7617
7618     /* set expr to the first arg op */
7619
7620     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7621          && expr->op_type != OP_CONST)
7622     {
7623             expr = cLISTOPx(expr)->op_first;
7624             assert(   expr->op_type == OP_PUSHMARK
7625                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7626                    || expr->op_type == OP_PADRANGE);
7627             expr = OpSIBLING(expr);
7628     }
7629
7630     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7631                         expr, &recompile, NULL);
7632
7633     /* handle bare (possibly after overloading) regex: foo =~ $re */
7634     {
7635         SV *re = pat;
7636         if (SvROK(re))
7637             re = SvRV(re);
7638         if (SvTYPE(re) == SVt_REGEXP) {
7639             if (is_bare_re)
7640                 *is_bare_re = TRUE;
7641             SvREFCNT_inc(re);
7642             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7643                 "Precompiled pattern%s\n",
7644                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7645
7646             return (REGEXP*)re;
7647         }
7648     }
7649
7650     exp = SvPV_nomg(pat, plen);
7651
7652     if (!eng->op_comp) {
7653         if ((SvUTF8(pat) && IN_BYTES)
7654                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7655         {
7656             /* make a temporary copy; either to convert to bytes,
7657              * or to avoid repeating get-magic / overloaded stringify */
7658             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7659                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7660         }
7661         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7662     }
7663
7664     /* ignore the utf8ness if the pattern is 0 length */
7665     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7666     RExC_uni_semantics = 0;
7667     RExC_contains_locale = 0;
7668     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7669     RExC_in_script_run = 0;
7670     RExC_study_started = 0;
7671     pRExC_state->runtime_code_qr = NULL;
7672     RExC_frame_head= NULL;
7673     RExC_frame_last= NULL;
7674     RExC_frame_count= 0;
7675     RExC_latest_warn_offset = 0;
7676     RExC_use_BRANCHJ = 0;
7677     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7678     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7679     RExC_total_parens = 0;
7680     RExC_open_parens = NULL;
7681     RExC_close_parens = NULL;
7682     RExC_paren_names = NULL;
7683     RExC_size = 0;
7684     RExC_seen_d_op = FALSE;
7685 #ifdef DEBUGGING
7686     RExC_paren_name_list = NULL;
7687 #endif
7688
7689     DEBUG_r({
7690         RExC_mysv1= sv_newmortal();
7691         RExC_mysv2= sv_newmortal();
7692     });
7693
7694     DEBUG_COMPILE_r({
7695             SV *dsv= sv_newmortal();
7696             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7697             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7698                           PL_colors[4], PL_colors[5], s);
7699         });
7700
7701     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7702      * to utf8 */
7703
7704     if ((pm_flags & PMf_USE_RE_EVAL)
7705                 /* this second condition covers the non-regex literal case,
7706                  * i.e.  $foo =~ '(?{})'. */
7707                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7708     )
7709         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7710
7711   redo_parse:
7712     /* return old regex if pattern hasn't changed */
7713     /* XXX: note in the below we have to check the flags as well as the
7714      * pattern.
7715      *
7716      * Things get a touch tricky as we have to compare the utf8 flag
7717      * independently from the compile flags.  */
7718
7719     if (   old_re
7720         && !recompile
7721         && !!RX_UTF8(old_re) == !!RExC_utf8
7722         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7723         && RX_PRECOMP(old_re)
7724         && RX_PRELEN(old_re) == plen
7725         && memEQ(RX_PRECOMP(old_re), exp, plen)
7726         && !runtime_code /* with runtime code, always recompile */ )
7727     {
7728         DEBUG_COMPILE_r({
7729             SV *dsv= sv_newmortal();
7730             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7731             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7732                           PL_colors[4], PL_colors[5], s);
7733         });
7734         return old_re;
7735     }
7736
7737     /* Allocate the pattern's SV */
7738     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7739     RExC_rx = ReANY(Rx);
7740     if ( RExC_rx == NULL )
7741         FAIL("Regexp out of space");
7742
7743     rx_flags = orig_rx_flags;
7744
7745     if (   (UTF || RExC_uni_semantics)
7746         && initial_charset == REGEX_DEPENDS_CHARSET)
7747     {
7748
7749         /* Set to use unicode semantics if the pattern is in utf8 and has the
7750          * 'depends' charset specified, as it means unicode when utf8  */
7751         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7752         RExC_uni_semantics = 1;
7753     }
7754
7755     RExC_pm_flags = pm_flags;
7756
7757     if (runtime_code) {
7758         assert(TAINTING_get || !TAINT_get);
7759         if (TAINT_get)
7760             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7761
7762         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7763             /* whoops, we have a non-utf8 pattern, whilst run-time code
7764              * got compiled as utf8. Try again with a utf8 pattern */
7765             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7766                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7767             goto redo_parse;
7768         }
7769     }
7770     assert(!pRExC_state->runtime_code_qr);
7771
7772     RExC_sawback = 0;
7773
7774     RExC_seen = 0;
7775     RExC_maxlen = 0;
7776     RExC_in_lookbehind = 0;
7777     RExC_in_lookahead = 0;
7778     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7779     RExC_recode_x_to_native = 0;
7780     RExC_in_multi_char_class = 0;
7781
7782     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7783     RExC_precomp_end = RExC_end = exp + plen;
7784     RExC_nestroot = 0;
7785     RExC_whilem_seen = 0;
7786     RExC_end_op = NULL;
7787     RExC_recurse = NULL;
7788     RExC_study_chunk_recursed = NULL;
7789     RExC_study_chunk_recursed_bytes= 0;
7790     RExC_recurse_count = 0;
7791     RExC_sets_depth = 0;
7792     pRExC_state->code_index = 0;
7793
7794     /* Initialize the string in the compiled pattern.  This is so that there is
7795      * something to output if necessary */
7796     set_regex_pv(pRExC_state, Rx);
7797
7798     DEBUG_PARSE_r({
7799         Perl_re_printf( aTHX_
7800             "Starting parse and generation\n");
7801         RExC_lastnum=0;
7802         RExC_lastparse=NULL;
7803     });
7804
7805     /* Allocate space and zero-initialize. Note, the two step process
7806        of zeroing when in debug mode, thus anything assigned has to
7807        happen after that */
7808     if (!  RExC_size) {
7809
7810         /* On the first pass of the parse, we guess how big this will be.  Then
7811          * we grow in one operation to that amount and then give it back.  As
7812          * we go along, we re-allocate what we need.
7813          *
7814          * XXX Currently the guess is essentially that the pattern will be an
7815          * EXACT node with one byte input, one byte output.  This is crude, and
7816          * better heuristics are welcome.
7817          *
7818          * On any subsequent passes, we guess what we actually computed in the
7819          * latest earlier pass.  Such a pass probably didn't complete so is
7820          * missing stuff.  We could improve those guesses by knowing where the
7821          * parse stopped, and use the length so far plus apply the above
7822          * assumption to what's left. */
7823         RExC_size = STR_SZ(RExC_end - RExC_start);
7824     }
7825
7826     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7827     if ( RExC_rxi == NULL )
7828         FAIL("Regexp out of space");
7829
7830     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7831     RXi_SET( RExC_rx, RExC_rxi );
7832
7833     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7834      * node parsed will give back any excess memory we have allocated so far).
7835      * */
7836     RExC_size = 0;
7837
7838     /* non-zero initialization begins here */
7839     RExC_rx->engine= eng;
7840     RExC_rx->extflags = rx_flags;
7841     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7842
7843     if (pm_flags & PMf_IS_QR) {
7844         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7845         if (RExC_rxi->code_blocks) {
7846             RExC_rxi->code_blocks->refcnt++;
7847         }
7848     }
7849
7850     RExC_rx->intflags = 0;
7851
7852     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7853     RExC_parse = exp;
7854
7855     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7856      * code makes sure the final byte is an uncounted NUL.  But should this
7857      * ever not be the case, lots of things could read beyond the end of the
7858      * buffer: loops like
7859      *      while(isFOO(*RExC_parse)) RExC_parse++;
7860      *      strchr(RExC_parse, "foo");
7861      * etc.  So it is worth noting. */
7862     assert(*RExC_end == '\0');
7863
7864     RExC_naughty = 0;
7865     RExC_npar = 1;
7866     RExC_parens_buf_size = 0;
7867     RExC_emit_start = RExC_rxi->program;
7868     pRExC_state->code_index = 0;
7869
7870     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7871     RExC_emit = 1;
7872
7873     /* Do the parse */
7874     if (reg(pRExC_state, 0, &flags, 1)) {
7875
7876         /* Success!, But we may need to redo the parse knowing how many parens
7877          * there actually are */
7878         if (IN_PARENS_PASS) {
7879             flags |= RESTART_PARSE;
7880         }
7881
7882         /* We have that number in RExC_npar */
7883         RExC_total_parens = RExC_npar;
7884     }
7885     else if (! MUST_RESTART(flags)) {
7886         ReREFCNT_dec(Rx);
7887         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7888     }
7889
7890     /* Here, we either have success, or we have to redo the parse for some reason */
7891     if (MUST_RESTART(flags)) {
7892
7893         /* It's possible to write a regexp in ascii that represents Unicode
7894         codepoints outside of the byte range, such as via \x{100}. If we
7895         detect such a sequence we have to convert the entire pattern to utf8
7896         and then recompile, as our sizing calculation will have been based
7897         on 1 byte == 1 character, but we will need to use utf8 to encode
7898         at least some part of the pattern, and therefore must convert the whole
7899         thing.
7900         -- dmq */
7901         if (flags & NEED_UTF8) {
7902
7903             /* We have stored the offset of the final warning output so far.
7904              * That must be adjusted.  Any variant characters between the start
7905              * of the pattern and this warning count for 2 bytes in the final,
7906              * so just add them again */
7907             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7908                 RExC_latest_warn_offset +=
7909                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7910                                                 + RExC_latest_warn_offset);
7911             }
7912             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7913             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7914             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7915         }
7916         else {
7917             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7918         }
7919
7920         if (ALL_PARENS_COUNTED) {
7921             /* Make enough room for all the known parens, and zero it */
7922             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7923             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7924             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7925
7926             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7927             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7928         }
7929         else { /* Parse did not complete.  Reinitialize the parentheses
7930                   structures */
7931             RExC_total_parens = 0;
7932             if (RExC_open_parens) {
7933                 Safefree(RExC_open_parens);
7934                 RExC_open_parens = NULL;
7935             }
7936             if (RExC_close_parens) {
7937                 Safefree(RExC_close_parens);
7938                 RExC_close_parens = NULL;
7939             }
7940         }
7941
7942         /* Clean up what we did in this parse */
7943         SvREFCNT_dec_NN(RExC_rx_sv);
7944
7945         goto redo_parse;
7946     }
7947
7948     /* Here, we have successfully parsed and generated the pattern's program
7949      * for the regex engine.  We are ready to finish things up and look for
7950      * optimizations. */
7951
7952     /* Update the string to compile, with correct modifiers, etc */
7953     set_regex_pv(pRExC_state, Rx);
7954
7955     RExC_rx->nparens = RExC_total_parens - 1;
7956
7957     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7958     if (RExC_whilem_seen > 15)
7959         RExC_whilem_seen = 15;
7960
7961     DEBUG_PARSE_r({
7962         Perl_re_printf( aTHX_
7963             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7964         RExC_lastnum=0;
7965         RExC_lastparse=NULL;
7966     });
7967
7968 #ifdef RE_TRACK_PATTERN_OFFSETS
7969     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7970                           "%s %" UVuf " bytes for offset annotations.\n",
7971                           RExC_offsets ? "Got" : "Couldn't get",
7972                           (UV)((RExC_offsets[0] * 2 + 1))));
7973     DEBUG_OFFSETS_r(if (RExC_offsets) {
7974         const STRLEN len = RExC_offsets[0];
7975         STRLEN i;
7976         DECLARE_AND_GET_RE_DEBUG_FLAGS;
7977         Perl_re_printf( aTHX_
7978                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7979         for (i = 1; i <= len; i++) {
7980             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7981                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7982                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7983         }
7984         Perl_re_printf( aTHX_  "\n");
7985     });
7986
7987 #else
7988     SetProgLen(RExC_rxi,RExC_size);
7989 #endif
7990
7991     DEBUG_DUMP_PRE_OPTIMIZE_r({
7992         SV * const sv = sv_newmortal();
7993         RXi_GET_DECL(RExC_rx, ri);
7994         DEBUG_RExC_seen();
7995         Perl_re_printf( aTHX_ "Program before optimization:\n");
7996
7997         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7998                         sv, 0, 0);
7999     });
8000
8001     DEBUG_OPTIMISE_r(
8002         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
8003     );
8004
8005     /* XXXX To minimize changes to RE engine we always allocate
8006        3-units-long substrs field. */
8007     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8008     if (RExC_recurse_count) {
8009         Newx(RExC_recurse, RExC_recurse_count, regnode *);
8010         SAVEFREEPV(RExC_recurse);
8011     }
8012
8013     if (RExC_seen & REG_RECURSE_SEEN) {
8014         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8015          * So its 1 if there are no parens. */
8016         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8017                                          ((RExC_total_parens & 0x07) != 0);
8018         Newx(RExC_study_chunk_recursed,
8019              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8020         SAVEFREEPV(RExC_study_chunk_recursed);
8021     }
8022
8023   reStudy:
8024     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8025     DEBUG_r(
8026         RExC_study_chunk_recursed_count= 0;
8027     );
8028     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8029     if (RExC_study_chunk_recursed) {
8030         Zero(RExC_study_chunk_recursed,
8031              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8032     }
8033
8034
8035 #ifdef TRIE_STUDY_OPT
8036     if (!restudied) {
8037         StructCopy(&zero_scan_data, &data, scan_data_t);
8038         copyRExC_state = RExC_state;
8039     } else {
8040         U32 seen=RExC_seen;
8041         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8042
8043         RExC_state = copyRExC_state;
8044         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8045             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8046         else
8047             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8048         StructCopy(&zero_scan_data, &data, scan_data_t);
8049     }
8050 #else
8051     StructCopy(&zero_scan_data, &data, scan_data_t);
8052 #endif
8053
8054     /* Dig out information for optimizations. */
8055     RExC_rx->extflags = RExC_flags; /* was pm_op */
8056     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8057
8058     if (UTF)
8059         SvUTF8_on(Rx);  /* Unicode in it? */
8060     RExC_rxi->regstclass = NULL;
8061     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
8062         RExC_rx->intflags |= PREGf_NAUGHTY;
8063     scan = RExC_rxi->program + 1;               /* First BRANCH. */
8064
8065     /* testing for BRANCH here tells us whether there is "must appear"
8066        data in the pattern. If there is then we can use it for optimisations */
8067     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
8068                                                   */
8069         SSize_t fake;
8070         STRLEN longest_length[2];
8071         regnode_ssc ch_class; /* pointed to by data */
8072         int stclass_flag;
8073         SSize_t last_close = 0; /* pointed to by data */
8074         regnode *first= scan;
8075         regnode *first_next= regnext(first);
8076         int i;
8077
8078         /*
8079          * Skip introductions and multiplicators >= 1
8080          * so that we can extract the 'meat' of the pattern that must
8081          * match in the large if() sequence following.
8082          * NOTE that EXACT is NOT covered here, as it is normally
8083          * picked up by the optimiser separately.
8084          *
8085          * This is unfortunate as the optimiser isnt handling lookahead
8086          * properly currently.
8087          *
8088          */
8089         while ((OP(first) == OPEN && (sawopen = 1)) ||
8090                /* An OR of *one* alternative - should not happen now. */
8091             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8092             /* for now we can't handle lookbehind IFMATCH*/
8093             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8094             (OP(first) == PLUS) ||
8095             (OP(first) == MINMOD) ||
8096                /* An {n,m} with n>0 */
8097             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8098             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8099         {
8100                 /*
8101                  * the only op that could be a regnode is PLUS, all the rest
8102                  * will be regnode_1 or regnode_2.
8103                  *
8104                  * (yves doesn't think this is true)
8105                  */
8106                 if (OP(first) == PLUS)
8107                     sawplus = 1;
8108                 else {
8109                     if (OP(first) == MINMOD)
8110                         sawminmod = 1;
8111                     first += regarglen[OP(first)];
8112                 }
8113                 first = NEXTOPER(first);
8114                 first_next= regnext(first);
8115         }
8116
8117         /* Starting-point info. */
8118       again:
8119         DEBUG_PEEP("first:", first, 0, 0);
8120         /* Ignore EXACT as we deal with it later. */
8121         if (PL_regkind[OP(first)] == EXACT) {
8122             if (   OP(first) == EXACT
8123                 || OP(first) == LEXACT
8124                 || OP(first) == EXACT_REQ8
8125                 || OP(first) == LEXACT_REQ8
8126                 || OP(first) == EXACTL)
8127             {
8128                 NOOP;   /* Empty, get anchored substr later. */
8129             }
8130             else
8131                 RExC_rxi->regstclass = first;
8132         }
8133 #ifdef TRIE_STCLASS
8134         else if (PL_regkind[OP(first)] == TRIE &&
8135                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8136         {
8137             /* this can happen only on restudy */
8138             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8139         }
8140 #endif
8141         else if (REGNODE_SIMPLE(OP(first)))
8142             RExC_rxi->regstclass = first;
8143         else if (PL_regkind[OP(first)] == BOUND ||
8144                  PL_regkind[OP(first)] == NBOUND)
8145             RExC_rxi->regstclass = first;
8146         else if (PL_regkind[OP(first)] == BOL) {
8147             RExC_rx->intflags |= (OP(first) == MBOL
8148                            ? PREGf_ANCH_MBOL
8149                            : PREGf_ANCH_SBOL);
8150             first = NEXTOPER(first);
8151             goto again;
8152         }
8153         else if (OP(first) == GPOS) {
8154             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8155             first = NEXTOPER(first);
8156             goto again;
8157         }
8158         else if ((!sawopen || !RExC_sawback) &&
8159             !sawlookahead &&
8160             (OP(first) == STAR &&
8161             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8162             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8163         {
8164             /* turn .* into ^.* with an implied $*=1 */
8165             const int type =
8166                 (OP(NEXTOPER(first)) == REG_ANY)
8167                     ? PREGf_ANCH_MBOL
8168                     : PREGf_ANCH_SBOL;
8169             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8170             first = NEXTOPER(first);
8171             goto again;
8172         }
8173         if (sawplus && !sawminmod && !sawlookahead
8174             && (!sawopen || !RExC_sawback)
8175             && !pRExC_state->code_blocks) /* May examine pos and $& */
8176             /* x+ must match at the 1st pos of run of x's */
8177             RExC_rx->intflags |= PREGf_SKIP;
8178
8179         /* Scan is after the zeroth branch, first is atomic matcher. */
8180 #ifdef TRIE_STUDY_OPT
8181         DEBUG_PARSE_r(
8182             if (!restudied)
8183                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8184                               (IV)(first - scan + 1))
8185         );
8186 #else
8187         DEBUG_PARSE_r(
8188             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8189                 (IV)(first - scan + 1))
8190         );
8191 #endif
8192
8193
8194         /*
8195         * If there's something expensive in the r.e., find the
8196         * longest literal string that must appear and make it the
8197         * regmust.  Resolve ties in favor of later strings, since
8198         * the regstart check works with the beginning of the r.e.
8199         * and avoiding duplication strengthens checking.  Not a
8200         * strong reason, but sufficient in the absence of others.
8201         * [Now we resolve ties in favor of the earlier string if
8202         * it happens that c_offset_min has been invalidated, since the
8203         * earlier string may buy us something the later one won't.]
8204         */
8205
8206         data.substrs[0].str = newSVpvs("");
8207         data.substrs[1].str = newSVpvs("");
8208         data.last_found = newSVpvs("");
8209         data.cur_is_floating = 0; /* initially any found substring is fixed */
8210         ENTER_with_name("study_chunk");
8211         SAVEFREESV(data.substrs[0].str);
8212         SAVEFREESV(data.substrs[1].str);
8213         SAVEFREESV(data.last_found);
8214         first = scan;
8215         if (!RExC_rxi->regstclass) {
8216             ssc_init(pRExC_state, &ch_class);
8217             data.start_class = &ch_class;
8218             stclass_flag = SCF_DO_STCLASS_AND;
8219         } else                          /* XXXX Check for BOUND? */
8220             stclass_flag = 0;
8221         data.last_closep = &last_close;
8222
8223         DEBUG_RExC_seen();
8224         /*
8225          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8226          * (NO top level branches)
8227          */
8228         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8229                              scan + RExC_size, /* Up to end */
8230             &data, -1, 0, NULL,
8231             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8232                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8233             0, TRUE);
8234
8235
8236         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8237
8238
8239         if ( RExC_total_parens == 1 && !data.cur_is_floating
8240              && data.last_start_min == 0 && data.last_end > 0
8241              && !RExC_seen_zerolen
8242              && !(RExC_seen & REG_VERBARG_SEEN)
8243              && !(RExC_seen & REG_GPOS_SEEN)
8244         ){
8245             RExC_rx->extflags |= RXf_CHECK_ALL;
8246         }
8247         scan_commit(pRExC_state, &data,&minlen, 0);
8248
8249
8250         /* XXX this is done in reverse order because that's the way the
8251          * code was before it was parameterised. Don't know whether it
8252          * actually needs doing in reverse order. DAPM */
8253         for (i = 1; i >= 0; i--) {
8254             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8255
8256             if (   !(   i
8257                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8258                      &&    data.substrs[0].min_offset
8259                         == data.substrs[1].min_offset
8260                      &&    SvCUR(data.substrs[0].str)
8261                         == SvCUR(data.substrs[1].str)
8262                     )
8263                 && S_setup_longest (aTHX_ pRExC_state,
8264                                         &(RExC_rx->substrs->data[i]),
8265                                         &(data.substrs[i]),
8266                                         longest_length[i]))
8267             {
8268                 RExC_rx->substrs->data[i].min_offset =
8269                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8270
8271                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8272                 /* Don't offset infinity */
8273                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8274                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8275                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8276             }
8277             else {
8278                 RExC_rx->substrs->data[i].substr      = NULL;
8279                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8280                 longest_length[i] = 0;
8281             }
8282         }
8283
8284         LEAVE_with_name("study_chunk");
8285
8286         if (RExC_rxi->regstclass
8287             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8288             RExC_rxi->regstclass = NULL;
8289
8290         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8291               || RExC_rx->substrs->data[0].min_offset)
8292             && stclass_flag
8293             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8294             && is_ssc_worth_it(pRExC_state, data.start_class))
8295         {
8296             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8297
8298             ssc_finalize(pRExC_state, data.start_class);
8299
8300             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8301             StructCopy(data.start_class,
8302                        (regnode_ssc*)RExC_rxi->data->data[n],
8303                        regnode_ssc);
8304             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8305             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8306             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8307                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8308                       Perl_re_printf( aTHX_
8309                                     "synthetic stclass \"%s\".\n",
8310                                     SvPVX_const(sv));});
8311             data.start_class = NULL;
8312         }
8313
8314         /* A temporary algorithm prefers floated substr to fixed one of
8315          * same length to dig more info. */
8316         i = (longest_length[0] <= longest_length[1]);
8317         RExC_rx->substrs->check_ix = i;
8318         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8319         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8320         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8321         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8322         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8323         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8324             RExC_rx->intflags |= PREGf_NOSCAN;
8325
8326         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8327             RExC_rx->extflags |= RXf_USE_INTUIT;
8328             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8329                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8330         }
8331
8332         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8333         if ( (STRLEN)minlen < longest_length[1] )
8334             minlen= longest_length[1];
8335         if ( (STRLEN)minlen < longest_length[0] )
8336             minlen= longest_length[0];
8337         */
8338     }
8339     else {
8340         /* Several toplevels. Best we can is to set minlen. */
8341         SSize_t fake;
8342         regnode_ssc ch_class;
8343         SSize_t last_close = 0;
8344
8345         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8346
8347         scan = RExC_rxi->program + 1;
8348         ssc_init(pRExC_state, &ch_class);
8349         data.start_class = &ch_class;
8350         data.last_closep = &last_close;
8351
8352         DEBUG_RExC_seen();
8353         /*
8354          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8355          * (patterns WITH top level branches)
8356          */
8357         minlen = study_chunk(pRExC_state,
8358             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8359             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8360                                                       ? SCF_TRIE_DOING_RESTUDY
8361                                                       : 0),
8362             0, TRUE);
8363
8364         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8365
8366         RExC_rx->check_substr = NULL;
8367         RExC_rx->check_utf8 = NULL;
8368         RExC_rx->substrs->data[0].substr      = NULL;
8369         RExC_rx->substrs->data[0].utf8_substr = NULL;
8370         RExC_rx->substrs->data[1].substr      = NULL;
8371         RExC_rx->substrs->data[1].utf8_substr = NULL;
8372
8373         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8374             && is_ssc_worth_it(pRExC_state, data.start_class))
8375         {
8376             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8377
8378             ssc_finalize(pRExC_state, data.start_class);
8379
8380             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8381             StructCopy(data.start_class,
8382                        (regnode_ssc*)RExC_rxi->data->data[n],
8383                        regnode_ssc);
8384             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8385             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8386             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8387                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8388                       Perl_re_printf( aTHX_
8389                                     "synthetic stclass \"%s\".\n",
8390                                     SvPVX_const(sv));});
8391             data.start_class = NULL;
8392         }
8393     }
8394
8395     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8396         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8397         RExC_rx->maxlen = REG_INFTY;
8398     }
8399     else {
8400         RExC_rx->maxlen = RExC_maxlen;
8401     }
8402
8403     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8404        the "real" pattern. */
8405     DEBUG_OPTIMISE_r({
8406         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8407                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8408     });
8409     RExC_rx->minlenret = minlen;
8410     if (RExC_rx->minlen < minlen)
8411         RExC_rx->minlen = minlen;
8412
8413     if (RExC_seen & REG_RECURSE_SEEN ) {
8414         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8415         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8416     }
8417     if (RExC_seen & REG_GPOS_SEEN)
8418         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8419     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8420         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8421                                                 lookbehind */
8422     if (pRExC_state->code_blocks)
8423         RExC_rx->extflags |= RXf_EVAL_SEEN;
8424     if (RExC_seen & REG_VERBARG_SEEN)
8425     {
8426         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8427         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8428     }
8429     if (RExC_seen & REG_CUTGROUP_SEEN)
8430         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8431     if (pm_flags & PMf_USE_RE_EVAL)
8432         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8433     if (RExC_paren_names)
8434         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8435     else
8436         RXp_PAREN_NAMES(RExC_rx) = NULL;
8437
8438     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8439      * so it can be used in pp.c */
8440     if (RExC_rx->intflags & PREGf_ANCH)
8441         RExC_rx->extflags |= RXf_IS_ANCHORED;
8442
8443
8444     {
8445         /* this is used to identify "special" patterns that might result
8446          * in Perl NOT calling the regex engine and instead doing the match "itself",
8447          * particularly special cases in split//. By having the regex compiler
8448          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8449          * we avoid weird issues with equivalent patterns resulting in different behavior,
8450          * AND we allow non Perl engines to get the same optimizations by the setting the
8451          * flags appropriately - Yves */
8452         regnode *first = RExC_rxi->program + 1;
8453         U8 fop = OP(first);
8454         regnode *next = regnext(first);
8455         U8 nop = OP(next);
8456
8457         if (PL_regkind[fop] == NOTHING && nop == END)
8458             RExC_rx->extflags |= RXf_NULL;
8459         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8460             /* when fop is SBOL first->flags will be true only when it was
8461              * produced by parsing /\A/, and not when parsing /^/. This is
8462              * very important for the split code as there we want to
8463              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8464              * See rt #122761 for more details. -- Yves */
8465             RExC_rx->extflags |= RXf_START_ONLY;
8466         else if (fop == PLUS
8467                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8468                  && nop == END)
8469             RExC_rx->extflags |= RXf_WHITE;
8470         else if ( RExC_rx->extflags & RXf_SPLIT
8471                   && (   fop == EXACT || fop == LEXACT
8472                       || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8473                       || fop == EXACTL)
8474                   && STR_LEN(first) == 1
8475                   && *(STRING(first)) == ' '
8476                   && nop == END )
8477             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8478
8479     }
8480
8481     if (RExC_contains_locale) {
8482         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8483     }
8484
8485 #ifdef DEBUGGING
8486     if (RExC_paren_names) {
8487         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8488         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8489                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8490     } else
8491 #endif
8492     RExC_rxi->name_list_idx = 0;
8493
8494     while ( RExC_recurse_count > 0 ) {
8495         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8496         /*
8497          * This data structure is set up in study_chunk() and is used
8498          * to calculate the distance between a GOSUB regopcode and
8499          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8500          * it refers to.
8501          *
8502          * If for some reason someone writes code that optimises
8503          * away a GOSUB opcode then the assert should be changed to
8504          * an if(scan) to guard the ARG2L_SET() - Yves
8505          *
8506          */
8507         assert(scan && OP(scan) == GOSUB);
8508         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8509     }
8510
8511     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8512     /* assume we don't need to swap parens around before we match */
8513     DEBUG_TEST_r({
8514         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8515             (unsigned long)RExC_study_chunk_recursed_count);
8516     });
8517     DEBUG_DUMP_r({
8518         DEBUG_RExC_seen();
8519         Perl_re_printf( aTHX_ "Final program:\n");
8520         regdump(RExC_rx);
8521     });
8522
8523     if (RExC_open_parens) {
8524         Safefree(RExC_open_parens);
8525         RExC_open_parens = NULL;
8526     }
8527     if (RExC_close_parens) {
8528         Safefree(RExC_close_parens);
8529         RExC_close_parens = NULL;
8530     }
8531
8532 #ifdef USE_ITHREADS
8533     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8534      * by setting the regexp SV to readonly-only instead. If the
8535      * pattern's been recompiled, the USEDness should remain. */
8536     if (old_re && SvREADONLY(old_re))
8537         SvREADONLY_on(Rx);
8538 #endif
8539     return Rx;
8540 }
8541
8542
8543 SV*
8544 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8545                     const U32 flags)
8546 {
8547     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8548
8549     PERL_UNUSED_ARG(value);
8550
8551     if (flags & RXapif_FETCH) {
8552         return reg_named_buff_fetch(rx, key, flags);
8553     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8554         Perl_croak_no_modify();
8555         return NULL;
8556     } else if (flags & RXapif_EXISTS) {
8557         return reg_named_buff_exists(rx, key, flags)
8558             ? &PL_sv_yes
8559             : &PL_sv_no;
8560     } else if (flags & RXapif_REGNAMES) {
8561         return reg_named_buff_all(rx, flags);
8562     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8563         return reg_named_buff_scalar(rx, flags);
8564     } else {
8565         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8566         return NULL;
8567     }
8568 }
8569
8570 SV*
8571 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8572                          const U32 flags)
8573 {
8574     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8575     PERL_UNUSED_ARG(lastkey);
8576
8577     if (flags & RXapif_FIRSTKEY)
8578         return reg_named_buff_firstkey(rx, flags);
8579     else if (flags & RXapif_NEXTKEY)
8580         return reg_named_buff_nextkey(rx, flags);
8581     else {
8582         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8583                                             (int)flags);
8584         return NULL;
8585     }
8586 }
8587
8588 SV*
8589 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8590                           const U32 flags)
8591 {
8592     SV *ret;
8593     struct regexp *const rx = ReANY(r);
8594
8595     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8596
8597     if (rx && RXp_PAREN_NAMES(rx)) {
8598         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8599         if (he_str) {
8600             IV i;
8601             SV* sv_dat=HeVAL(he_str);
8602             I32 *nums=(I32*)SvPVX(sv_dat);
8603             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8604             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8605                 if ((I32)(rx->nparens) >= nums[i]
8606                     && rx->offs[nums[i]].start != -1
8607                     && rx->offs[nums[i]].end != -1)
8608                 {
8609                     ret = newSVpvs("");
8610                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8611                     if (!retarray)
8612                         return ret;
8613                 } else {
8614                     if (retarray)
8615                         ret = newSVsv(&PL_sv_undef);
8616                 }
8617                 if (retarray)
8618                     av_push(retarray, ret);
8619             }
8620             if (retarray)
8621                 return newRV_noinc(MUTABLE_SV(retarray));
8622         }
8623     }
8624     return NULL;
8625 }
8626
8627 bool
8628 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8629                            const U32 flags)
8630 {
8631     struct regexp *const rx = ReANY(r);
8632
8633     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8634
8635     if (rx && RXp_PAREN_NAMES(rx)) {
8636         if (flags & RXapif_ALL) {
8637             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8638         } else {
8639             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8640             if (sv) {
8641                 SvREFCNT_dec_NN(sv);
8642                 return TRUE;
8643             } else {
8644                 return FALSE;
8645             }
8646         }
8647     } else {
8648         return FALSE;
8649     }
8650 }
8651
8652 SV*
8653 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8654 {
8655     struct regexp *const rx = ReANY(r);
8656
8657     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8658
8659     if ( rx && RXp_PAREN_NAMES(rx) ) {
8660         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8661
8662         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8663     } else {
8664         return FALSE;
8665     }
8666 }
8667
8668 SV*
8669 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8670 {
8671     struct regexp *const rx = ReANY(r);
8672     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8673
8674     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8675
8676     if (rx && RXp_PAREN_NAMES(rx)) {
8677         HV *hv = RXp_PAREN_NAMES(rx);
8678         HE *temphe;
8679         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8680             IV i;
8681             IV parno = 0;
8682             SV* sv_dat = HeVAL(temphe);
8683             I32 *nums = (I32*)SvPVX(sv_dat);
8684             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8685                 if ((I32)(rx->lastparen) >= nums[i] &&
8686                     rx->offs[nums[i]].start != -1 &&
8687                     rx->offs[nums[i]].end != -1)
8688                 {
8689                     parno = nums[i];
8690                     break;
8691                 }
8692             }
8693             if (parno || flags & RXapif_ALL) {
8694                 return newSVhek(HeKEY_hek(temphe));
8695             }
8696         }
8697     }
8698     return NULL;
8699 }
8700
8701 SV*
8702 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8703 {
8704     SV *ret;
8705     AV *av;
8706     SSize_t length;
8707     struct regexp *const rx = ReANY(r);
8708
8709     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8710
8711     if (rx && RXp_PAREN_NAMES(rx)) {
8712         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8713             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8714         } else if (flags & RXapif_ONE) {
8715             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8716             av = MUTABLE_AV(SvRV(ret));
8717             length = av_count(av);
8718             SvREFCNT_dec_NN(ret);
8719             return newSViv(length);
8720         } else {
8721             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8722                                                 (int)flags);
8723             return NULL;
8724         }
8725     }
8726     return &PL_sv_undef;
8727 }
8728
8729 SV*
8730 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8731 {
8732     struct regexp *const rx = ReANY(r);
8733     AV *av = newAV();
8734
8735     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8736
8737     if (rx && RXp_PAREN_NAMES(rx)) {
8738         HV *hv= RXp_PAREN_NAMES(rx);
8739         HE *temphe;
8740         (void)hv_iterinit(hv);
8741         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8742             IV i;
8743             IV parno = 0;
8744             SV* sv_dat = HeVAL(temphe);
8745             I32 *nums = (I32*)SvPVX(sv_dat);
8746             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8747                 if ((I32)(rx->lastparen) >= nums[i] &&
8748                     rx->offs[nums[i]].start != -1 &&
8749                     rx->offs[nums[i]].end != -1)
8750                 {
8751                     parno = nums[i];
8752                     break;
8753                 }
8754             }
8755             if (parno || flags & RXapif_ALL) {
8756                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8757             }
8758         }
8759     }
8760
8761     return newRV_noinc(MUTABLE_SV(av));
8762 }
8763
8764 void
8765 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8766                              SV * const sv)
8767 {
8768     struct regexp *const rx = ReANY(r);
8769     char *s = NULL;
8770     SSize_t i = 0;
8771     SSize_t s1, t1;
8772     I32 n = paren;
8773
8774     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8775
8776     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8777            || n == RX_BUFF_IDX_CARET_FULLMATCH
8778            || n == RX_BUFF_IDX_CARET_POSTMATCH
8779        )
8780     {
8781         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8782         if (!keepcopy) {
8783             /* on something like
8784              *    $r = qr/.../;
8785              *    /$qr/p;
8786              * the KEEPCOPY is set on the PMOP rather than the regex */
8787             if (PL_curpm && r == PM_GETRE(PL_curpm))
8788                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8789         }
8790         if (!keepcopy)
8791             goto ret_undef;
8792     }
8793
8794     if (!rx->subbeg)
8795         goto ret_undef;
8796
8797     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8798         /* no need to distinguish between them any more */
8799         n = RX_BUFF_IDX_FULLMATCH;
8800
8801     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8802         && rx->offs[0].start != -1)
8803     {
8804         /* $`, ${^PREMATCH} */
8805         i = rx->offs[0].start;
8806         s = rx->subbeg;
8807     }
8808     else
8809     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8810         && rx->offs[0].end != -1)
8811     {
8812         /* $', ${^POSTMATCH} */
8813         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8814         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8815     }
8816     else
8817     if (inRANGE(n, 0, (I32)rx->nparens) &&
8818         (s1 = rx->offs[n].start) != -1  &&
8819         (t1 = rx->offs[n].end) != -1)
8820     {
8821         /* $&, ${^MATCH},  $1 ... */
8822         i = t1 - s1;
8823         s = rx->subbeg + s1 - rx->suboffset;
8824     } else {
8825         goto ret_undef;
8826     }
8827
8828     assert(s >= rx->subbeg);
8829     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8830     if (i >= 0) {
8831 #ifdef NO_TAINT_SUPPORT
8832         sv_setpvn(sv, s, i);
8833 #else
8834         const int oldtainted = TAINT_get;
8835         TAINT_NOT;
8836         sv_setpvn(sv, s, i);
8837         TAINT_set(oldtainted);
8838 #endif
8839         if (RXp_MATCH_UTF8(rx))
8840             SvUTF8_on(sv);
8841         else
8842             SvUTF8_off(sv);
8843         if (TAINTING_get) {
8844             if (RXp_MATCH_TAINTED(rx)) {
8845                 if (SvTYPE(sv) >= SVt_PVMG) {
8846                     MAGIC* const mg = SvMAGIC(sv);
8847                     MAGIC* mgt;
8848                     TAINT;
8849                     SvMAGIC_set(sv, mg->mg_moremagic);
8850                     SvTAINT(sv);
8851                     if ((mgt = SvMAGIC(sv))) {
8852                         mg->mg_moremagic = mgt;
8853                         SvMAGIC_set(sv, mg);
8854                     }
8855                 } else {
8856                     TAINT;
8857                     SvTAINT(sv);
8858                 }
8859             } else
8860                 SvTAINTED_off(sv);
8861         }
8862     } else {
8863       ret_undef:
8864         sv_set_undef(sv);
8865         return;
8866     }
8867 }
8868
8869 void
8870 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8871                                                          SV const * const value)
8872 {
8873     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8874
8875     PERL_UNUSED_ARG(rx);
8876     PERL_UNUSED_ARG(paren);
8877     PERL_UNUSED_ARG(value);
8878
8879     if (!PL_localizing)
8880         Perl_croak_no_modify();
8881 }
8882
8883 I32
8884 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8885                               const I32 paren)
8886 {
8887     struct regexp *const rx = ReANY(r);
8888     I32 i;
8889     I32 s1, t1;
8890
8891     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8892
8893     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8894         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8895         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8896     )
8897     {
8898         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8899         if (!keepcopy) {
8900             /* on something like
8901              *    $r = qr/.../;
8902              *    /$qr/p;
8903              * the KEEPCOPY is set on the PMOP rather than the regex */
8904             if (PL_curpm && r == PM_GETRE(PL_curpm))
8905                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8906         }
8907         if (!keepcopy)
8908             goto warn_undef;
8909     }
8910
8911     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8912     switch (paren) {
8913       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8914       case RX_BUFF_IDX_PREMATCH:       /* $` */
8915         if (rx->offs[0].start != -1) {
8916                         i = rx->offs[0].start;
8917                         if (i > 0) {
8918                                 s1 = 0;
8919                                 t1 = i;
8920                                 goto getlen;
8921                         }
8922             }
8923         return 0;
8924
8925       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8926       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8927             if (rx->offs[0].end != -1) {
8928                         i = rx->sublen - rx->offs[0].end;
8929                         if (i > 0) {
8930                                 s1 = rx->offs[0].end;
8931                                 t1 = rx->sublen;
8932                                 goto getlen;
8933                         }
8934             }
8935         return 0;
8936
8937       default: /* $& / ${^MATCH}, $1, $2, ... */
8938             if (paren <= (I32)rx->nparens &&
8939             (s1 = rx->offs[paren].start) != -1 &&
8940             (t1 = rx->offs[paren].end) != -1)
8941             {
8942             i = t1 - s1;
8943             goto getlen;
8944         } else {
8945           warn_undef:
8946             if (ckWARN(WARN_UNINITIALIZED))
8947                 report_uninit((const SV *)sv);
8948             return 0;
8949         }
8950     }
8951   getlen:
8952     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8953         const char * const s = rx->subbeg - rx->suboffset + s1;
8954         const U8 *ep;
8955         STRLEN el;
8956
8957         i = t1 - s1;
8958         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8959             i = el;
8960     }
8961     return i;
8962 }
8963
8964 SV*
8965 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8966 {
8967     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8968         PERL_UNUSED_ARG(rx);
8969         if (0)
8970             return NULL;
8971         else
8972             return newSVpvs("Regexp");
8973 }
8974
8975 /* Scans the name of a named buffer from the pattern.
8976  * If flags is REG_RSN_RETURN_NULL returns null.
8977  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8978  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8979  * to the parsed name as looked up in the RExC_paren_names hash.
8980  * If there is an error throws a vFAIL().. type exception.
8981  */
8982
8983 #define REG_RSN_RETURN_NULL    0
8984 #define REG_RSN_RETURN_NAME    1
8985 #define REG_RSN_RETURN_DATA    2
8986
8987 STATIC SV*
8988 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8989 {
8990     char *name_start = RExC_parse;
8991     SV* sv_name;
8992
8993     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8994
8995     assert (RExC_parse <= RExC_end);
8996     if (RExC_parse == RExC_end) NOOP;
8997     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8998          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8999           * using do...while */
9000         if (UTF)
9001             do {
9002                 RExC_parse += UTF8SKIP(RExC_parse);
9003             } while (   RExC_parse < RExC_end
9004                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9005         else
9006             do {
9007                 RExC_parse++;
9008             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9009     } else {
9010         RExC_parse++; /* so the <- from the vFAIL is after the offending
9011                          character */
9012         vFAIL("Group name must start with a non-digit word character");
9013     }
9014     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9015                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9016     if ( flags == REG_RSN_RETURN_NAME)
9017         return sv_name;
9018     else if (flags==REG_RSN_RETURN_DATA) {
9019         HE *he_str = NULL;
9020         SV *sv_dat = NULL;
9021         if ( ! sv_name )      /* should not happen*/
9022             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9023         if (RExC_paren_names)
9024             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9025         if ( he_str )
9026             sv_dat = HeVAL(he_str);
9027         if ( ! sv_dat ) {   /* Didn't find group */
9028
9029             /* It might be a forward reference; we can't fail until we
9030                 * know, by completing the parse to get all the groups, and
9031                 * then reparsing */
9032             if (ALL_PARENS_COUNTED)  {
9033                 vFAIL("Reference to nonexistent named group");
9034             }
9035             else {
9036                 REQUIRE_PARENS_PASS;
9037             }
9038         }
9039         return sv_dat;
9040     }
9041
9042     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9043                      (unsigned long) flags);
9044 }
9045
9046 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
9047     if (RExC_lastparse!=RExC_parse) {                           \
9048         Perl_re_printf( aTHX_  "%s",                            \
9049             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
9050                 RExC_end - RExC_parse, 16,                      \
9051                 "", "",                                         \
9052                 PERL_PV_ESCAPE_UNI_DETECT |                     \
9053                 PERL_PV_PRETTY_ELLIPSES   |                     \
9054                 PERL_PV_PRETTY_LTGT       |                     \
9055                 PERL_PV_ESCAPE_RE         |                     \
9056                 PERL_PV_PRETTY_EXACTSIZE                        \
9057             )                                                   \
9058         );                                                      \
9059     } else                                                      \
9060         Perl_re_printf( aTHX_ "%16s","");                       \
9061                                                                 \
9062     if (RExC_lastnum!=RExC_emit)                                \
9063        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
9064     else                                                        \
9065        Perl_re_printf( aTHX_ "|%4s","");                        \
9066     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
9067         (int)((depth*2)), "",                                   \
9068         (funcname)                                              \
9069     );                                                          \
9070     RExC_lastnum=RExC_emit;                                     \
9071     RExC_lastparse=RExC_parse;                                  \
9072 })
9073
9074
9075
9076 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
9077     DEBUG_PARSE_MSG((funcname));                            \
9078     Perl_re_printf( aTHX_ "%4s","\n");                                  \
9079 })
9080 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
9081     DEBUG_PARSE_MSG((funcname));                            \
9082     Perl_re_printf( aTHX_ fmt "\n",args);                               \
9083 })
9084
9085 /* This section of code defines the inversion list object and its methods.  The
9086  * interfaces are highly subject to change, so as much as possible is static to
9087  * this file.  An inversion list is here implemented as a malloc'd C UV array
9088  * as an SVt_INVLIST scalar.
9089  *
9090  * An inversion list for Unicode is an array of code points, sorted by ordinal
9091  * number.  Each element gives the code point that begins a range that extends
9092  * up-to but not including the code point given by the next element.  The final
9093  * element gives the first code point of a range that extends to the platform's
9094  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
9095  * ...) give ranges whose code points are all in the inversion list.  We say
9096  * that those ranges are in the set.  The odd-numbered elements give ranges
9097  * whose code points are not in the inversion list, and hence not in the set.
9098  * Thus, element [0] is the first code point in the list.  Element [1]
9099  * is the first code point beyond that not in the list; and element [2] is the
9100  * first code point beyond that that is in the list.  In other words, the first
9101  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9102  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
9103  * all code points in that range are not in the inversion list.  The third
9104  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9105  * list, and so forth.  Thus every element whose index is divisible by two
9106  * gives the beginning of a range that is in the list, and every element whose
9107  * index is not divisible by two gives the beginning of a range not in the
9108  * list.  If the final element's index is divisible by two, the inversion list
9109  * extends to the platform's infinity; otherwise the highest code point in the
9110  * inversion list is the contents of that element minus 1.
9111  *
9112  * A range that contains just a single code point N will look like
9113  *  invlist[i]   == N
9114  *  invlist[i+1] == N+1
9115  *
9116  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9117  * impossible to represent, so element [i+1] is omitted.  The single element
9118  * inversion list
9119  *  invlist[0] == UV_MAX
9120  * contains just UV_MAX, but is interpreted as matching to infinity.
9121  *
9122  * Taking the complement (inverting) an inversion list is quite simple, if the
9123  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9124  * This implementation reserves an element at the beginning of each inversion
9125  * list to always contain 0; there is an additional flag in the header which
9126  * indicates if the list begins at the 0, or is offset to begin at the next
9127  * element.  This means that the inversion list can be inverted without any
9128  * copying; just flip the flag.
9129  *
9130  * More about inversion lists can be found in "Unicode Demystified"
9131  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9132  *
9133  * The inversion list data structure is currently implemented as an SV pointing
9134  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9135  * array of UV whose memory management is automatically handled by the existing
9136  * facilities for SV's.
9137  *
9138  * Some of the methods should always be private to the implementation, and some
9139  * should eventually be made public */
9140
9141 /* The header definitions are in F<invlist_inline.h> */
9142
9143 #ifndef PERL_IN_XSUB_RE
9144
9145 PERL_STATIC_INLINE UV*
9146 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9147 {
9148     /* Returns a pointer to the first element in the inversion list's array.
9149      * This is called upon initialization of an inversion list.  Where the
9150      * array begins depends on whether the list has the code point U+0000 in it
9151      * or not.  The other parameter tells it whether the code that follows this
9152      * call is about to put a 0 in the inversion list or not.  The first
9153      * element is either the element reserved for 0, if TRUE, or the element
9154      * after it, if FALSE */
9155
9156     bool* offset = get_invlist_offset_addr(invlist);
9157     UV* zero_addr = (UV *) SvPVX(invlist);
9158
9159     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9160
9161     /* Must be empty */
9162     assert(! _invlist_len(invlist));
9163
9164     *zero_addr = 0;
9165
9166     /* 1^1 = 0; 1^0 = 1 */
9167     *offset = 1 ^ will_have_0;
9168     return zero_addr + *offset;
9169 }
9170
9171 STATIC void
9172 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9173 {
9174     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9175      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9176      * is similar to what SvSetMagicSV() would do, if it were implemented on
9177      * inversion lists, though this routine avoids a copy */
9178
9179     const UV src_len          = _invlist_len(src);
9180     const bool src_offset     = *get_invlist_offset_addr(src);
9181     const STRLEN src_byte_len = SvLEN(src);
9182     char * array              = SvPVX(src);
9183
9184     const int oldtainted = TAINT_get;
9185
9186     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9187
9188     assert(is_invlist(src));
9189     assert(is_invlist(dest));
9190     assert(! invlist_is_iterating(src));
9191     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9192
9193     /* Make sure it ends in the right place with a NUL, as our inversion list
9194      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9195      * asserts it */
9196     array[src_byte_len - 1] = '\0';
9197
9198     TAINT_NOT;      /* Otherwise it breaks */
9199     sv_usepvn_flags(dest,
9200                     (char *) array,
9201                     src_byte_len - 1,
9202
9203                     /* This flag is documented to cause a copy to be avoided */
9204                     SV_HAS_TRAILING_NUL);
9205     TAINT_set(oldtainted);
9206     SvPV_set(src, 0);
9207     SvLEN_set(src, 0);
9208     SvCUR_set(src, 0);
9209
9210     /* Finish up copying over the other fields in an inversion list */
9211     *get_invlist_offset_addr(dest) = src_offset;
9212     invlist_set_len(dest, src_len, src_offset);
9213     *get_invlist_previous_index_addr(dest) = 0;
9214     invlist_iterfinish(dest);
9215 }
9216
9217 PERL_STATIC_INLINE IV*
9218 S_get_invlist_previous_index_addr(SV* invlist)
9219 {
9220     /* Return the address of the IV that is reserved to hold the cached index
9221      * */
9222     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9223
9224     assert(is_invlist(invlist));
9225
9226     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9227 }
9228
9229 PERL_STATIC_INLINE IV
9230 S_invlist_previous_index(SV* const invlist)
9231 {
9232     /* Returns cached index of previous search */
9233
9234     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9235
9236     return *get_invlist_previous_index_addr(invlist);
9237 }
9238
9239 PERL_STATIC_INLINE void
9240 S_invlist_set_previous_index(SV* const invlist, const IV index)
9241 {
9242     /* Caches <index> for later retrieval */
9243
9244     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9245
9246     assert(index == 0 || index < (int) _invlist_len(invlist));
9247
9248     *get_invlist_previous_index_addr(invlist) = index;
9249 }
9250
9251 PERL_STATIC_INLINE void
9252 S_invlist_trim(SV* invlist)
9253 {
9254     /* Free the not currently-being-used space in an inversion list */
9255
9256     /* But don't free up the space needed for the 0 UV that is always at the
9257      * beginning of the list, nor the trailing NUL */
9258     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9259
9260     PERL_ARGS_ASSERT_INVLIST_TRIM;
9261
9262     assert(is_invlist(invlist));
9263
9264     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9265 }
9266
9267 PERL_STATIC_INLINE void
9268 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9269 {
9270     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9271
9272     assert(is_invlist(invlist));
9273
9274     invlist_set_len(invlist, 0, 0);
9275     invlist_trim(invlist);
9276 }
9277
9278 #endif /* ifndef PERL_IN_XSUB_RE */
9279
9280 PERL_STATIC_INLINE bool
9281 S_invlist_is_iterating(SV* const invlist)
9282 {
9283     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9284
9285     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9286 }
9287
9288 #ifndef PERL_IN_XSUB_RE
9289
9290 PERL_STATIC_INLINE UV
9291 S_invlist_max(SV* const invlist)
9292 {
9293     /* Returns the maximum number of elements storable in the inversion list's
9294      * array, without having to realloc() */
9295
9296     PERL_ARGS_ASSERT_INVLIST_MAX;
9297
9298     assert(is_invlist(invlist));
9299
9300     /* Assumes worst case, in which the 0 element is not counted in the
9301      * inversion list, so subtracts 1 for that */
9302     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9303            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9304            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9305 }
9306
9307 STATIC void
9308 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9309 {
9310     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9311
9312     /* First 1 is in case the zero element isn't in the list; second 1 is for
9313      * trailing NUL */
9314     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9315     invlist_set_len(invlist, 0, 0);
9316
9317     /* Force iterinit() to be used to get iteration to work */
9318     invlist_iterfinish(invlist);
9319
9320     *get_invlist_previous_index_addr(invlist) = 0;
9321     SvPOK_on(invlist);  /* This allows B to extract the PV */
9322 }
9323
9324 SV*
9325 Perl__new_invlist(pTHX_ IV initial_size)
9326 {
9327
9328     /* Return a pointer to a newly constructed inversion list, with enough
9329      * space to store 'initial_size' elements.  If that number is negative, a
9330      * system default is used instead */
9331
9332     SV* new_list;
9333
9334     if (initial_size < 0) {
9335         initial_size = 10;
9336     }
9337
9338     new_list = newSV_type(SVt_INVLIST);
9339     initialize_invlist_guts(new_list, initial_size);
9340
9341     return new_list;
9342 }
9343
9344 SV*
9345 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9346 {
9347     /* Return a pointer to a newly constructed inversion list, initialized to
9348      * point to <list>, which has to be in the exact correct inversion list
9349      * form, including internal fields.  Thus this is a dangerous routine that
9350      * should not be used in the wrong hands.  The passed in 'list' contains
9351      * several header fields at the beginning that are not part of the
9352      * inversion list body proper */
9353
9354     const STRLEN length = (STRLEN) list[0];
9355     const UV version_id =          list[1];
9356     const bool offset   =    cBOOL(list[2]);
9357 #define HEADER_LENGTH 3
9358     /* If any of the above changes in any way, you must change HEADER_LENGTH
9359      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9360      *      perl -E 'say int(rand 2**31-1)'
9361      */
9362 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9363                                         data structure type, so that one being
9364                                         passed in can be validated to be an
9365                                         inversion list of the correct vintage.
9366                                        */
9367
9368     SV* invlist = newSV_type(SVt_INVLIST);
9369
9370     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9371
9372     if (version_id != INVLIST_VERSION_ID) {
9373         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9374     }
9375
9376     /* The generated array passed in includes header elements that aren't part
9377      * of the list proper, so start it just after them */
9378     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9379
9380     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9381                                shouldn't touch it */
9382
9383     *(get_invlist_offset_addr(invlist)) = offset;
9384
9385     /* The 'length' passed to us is the physical number of elements in the
9386      * inversion list.  But if there is an offset the logical number is one
9387      * less than that */
9388     invlist_set_len(invlist, length  - offset, offset);
9389
9390     invlist_set_previous_index(invlist, 0);
9391
9392     /* Initialize the iteration pointer. */
9393     invlist_iterfinish(invlist);
9394
9395     SvREADONLY_on(invlist);
9396     SvPOK_on(invlist);
9397
9398     return invlist;
9399 }
9400
9401 STATIC void
9402 S__append_range_to_invlist(pTHX_ SV* const invlist,
9403                                  const UV start, const UV end)
9404 {
9405    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9406     * the end of the inversion list.  The range must be above any existing
9407     * ones. */
9408
9409     UV* array;
9410     UV max = invlist_max(invlist);
9411     UV len = _invlist_len(invlist);
9412     bool offset;
9413
9414     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9415
9416     if (len == 0) { /* Empty lists must be initialized */
9417         offset = start != 0;
9418         array = _invlist_array_init(invlist, ! offset);
9419     }
9420     else {
9421         /* Here, the existing list is non-empty. The current max entry in the
9422          * list is generally the first value not in the set, except when the
9423          * set extends to the end of permissible values, in which case it is
9424          * the first entry in that final set, and so this call is an attempt to
9425          * append out-of-order */
9426
9427         UV final_element = len - 1;
9428         array = invlist_array(invlist);
9429         if (   array[final_element] > start
9430             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9431         {
9432             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9433                      array[final_element], start,
9434                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9435         }
9436
9437         /* Here, it is a legal append.  If the new range begins 1 above the end
9438          * of the range below it, it is extending the range below it, so the
9439          * new first value not in the set is one greater than the newly
9440          * extended range.  */
9441         offset = *get_invlist_offset_addr(invlist);
9442         if (array[final_element] == start) {
9443             if (end != UV_MAX) {
9444                 array[final_element] = end + 1;
9445             }
9446             else {
9447                 /* But if the end is the maximum representable on the machine,
9448                  * assume that infinity was actually what was meant.  Just let
9449                  * the range that this would extend to have no end */
9450                 invlist_set_len(invlist, len - 1, offset);
9451             }
9452             return;
9453         }
9454     }
9455
9456     /* Here the new range doesn't extend any existing set.  Add it */
9457
9458     len += 2;   /* Includes an element each for the start and end of range */
9459
9460     /* If wll overflow the existing space, extend, which may cause the array to
9461      * be moved */
9462     if (max < len) {
9463         invlist_extend(invlist, len);
9464
9465         /* Have to set len here to avoid assert failure in invlist_array() */
9466         invlist_set_len(invlist, len, offset);
9467
9468         array = invlist_array(invlist);
9469     }
9470     else {
9471         invlist_set_len(invlist, len, offset);
9472     }
9473
9474     /* The next item on the list starts the range, the one after that is
9475      * one past the new range.  */
9476     array[len - 2] = start;
9477     if (end != UV_MAX) {
9478         array[len - 1] = end + 1;
9479     }
9480     else {
9481         /* But if the end is the maximum representable on the machine, just let
9482          * the range have no end */
9483         invlist_set_len(invlist, len - 1, offset);
9484     }
9485 }
9486
9487 SSize_t
9488 Perl__invlist_search(SV* const invlist, const UV cp)
9489 {
9490     /* Searches the inversion list for the entry that contains the input code
9491      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9492      * return value is the index into the list's array of the range that
9493      * contains <cp>, that is, 'i' such that
9494      *  array[i] <= cp < array[i+1]
9495      */
9496
9497     IV low = 0;
9498     IV mid;
9499     IV high = _invlist_len(invlist);
9500     const IV highest_element = high - 1;
9501     const UV* array;
9502
9503     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9504
9505     /* If list is empty, return failure. */
9506     if (high == 0) {
9507         return -1;
9508     }
9509
9510     /* (We can't get the array unless we know the list is non-empty) */
9511     array = invlist_array(invlist);
9512
9513     mid = invlist_previous_index(invlist);
9514     assert(mid >=0);
9515     if (mid > highest_element) {
9516         mid = highest_element;
9517     }
9518
9519     /* <mid> contains the cache of the result of the previous call to this
9520      * function (0 the first time).  See if this call is for the same result,
9521      * or if it is for mid-1.  This is under the theory that calls to this
9522      * function will often be for related code points that are near each other.
9523      * And benchmarks show that caching gives better results.  We also test
9524      * here if the code point is within the bounds of the list.  These tests
9525      * replace others that would have had to be made anyway to make sure that
9526      * the array bounds were not exceeded, and these give us extra information
9527      * at the same time */
9528     if (cp >= array[mid]) {
9529         if (cp >= array[highest_element]) {
9530             return highest_element;
9531         }
9532
9533         /* Here, array[mid] <= cp < array[highest_element].  This means that
9534          * the final element is not the answer, so can exclude it; it also
9535          * means that <mid> is not the final element, so can refer to 'mid + 1'
9536          * safely */
9537         if (cp < array[mid + 1]) {
9538             return mid;
9539         }
9540         high--;
9541         low = mid + 1;
9542     }
9543     else { /* cp < aray[mid] */
9544         if (cp < array[0]) { /* Fail if outside the array */
9545             return -1;
9546         }
9547         high = mid;
9548         if (cp >= array[mid - 1]) {
9549             goto found_entry;
9550         }
9551     }
9552
9553     /* Binary search.  What we are looking for is <i> such that
9554      *  array[i] <= cp < array[i+1]
9555      * The loop below converges on the i+1.  Note that there may not be an
9556      * (i+1)th element in the array, and things work nonetheless */
9557     while (low < high) {
9558         mid = (low + high) / 2;
9559         assert(mid <= highest_element);
9560         if (array[mid] <= cp) { /* cp >= array[mid] */
9561             low = mid + 1;
9562
9563             /* We could do this extra test to exit the loop early.
9564             if (cp < array[low]) {
9565                 return mid;
9566             }
9567             */
9568         }
9569         else { /* cp < array[mid] */
9570             high = mid;
9571         }
9572     }
9573
9574   found_entry:
9575     high--;
9576     invlist_set_previous_index(invlist, high);
9577     return high;
9578 }
9579
9580 void
9581 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9582                                          const bool complement_b, SV** output)
9583 {
9584     /* Take the union of two inversion lists and point '*output' to it.  On
9585      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9586      * even 'a' or 'b').  If to an inversion list, the contents of the original
9587      * list will be replaced by the union.  The first list, 'a', may be
9588      * NULL, in which case a copy of the second list is placed in '*output'.
9589      * If 'complement_b' is TRUE, the union is taken of the complement
9590      * (inversion) of 'b' instead of b itself.
9591      *
9592      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9593      * Richard Gillam, published by Addison-Wesley, and explained at some
9594      * length there.  The preface says to incorporate its examples into your
9595      * code at your own risk.
9596      *
9597      * The algorithm is like a merge sort. */
9598
9599     const UV* array_a;    /* a's array */
9600     const UV* array_b;
9601     UV len_a;       /* length of a's array */
9602     UV len_b;
9603
9604     SV* u;                      /* the resulting union */
9605     UV* array_u;
9606     UV len_u = 0;
9607
9608     UV i_a = 0;             /* current index into a's array */
9609     UV i_b = 0;
9610     UV i_u = 0;
9611
9612     /* running count, as explained in the algorithm source book; items are
9613      * stopped accumulating and are output when the count changes to/from 0.
9614      * The count is incremented when we start a range that's in an input's set,
9615      * and decremented when we start a range that's not in a set.  So this
9616      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9617      * and hence nothing goes into the union; 1, just one of the inputs is in
9618      * its set (and its current range gets added to the union); and 2 when both
9619      * inputs are in their sets.  */
9620     UV count = 0;
9621
9622     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9623     assert(a != b);
9624     assert(*output == NULL || is_invlist(*output));
9625
9626     len_b = _invlist_len(b);
9627     if (len_b == 0) {
9628
9629         /* Here, 'b' is empty, hence it's complement is all possible code
9630          * points.  So if the union includes the complement of 'b', it includes
9631          * everything, and we need not even look at 'a'.  It's easiest to
9632          * create a new inversion list that matches everything.  */
9633         if (complement_b) {
9634             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9635
9636             if (*output == NULL) { /* If the output didn't exist, just point it
9637                                       at the new list */
9638                 *output = everything;
9639             }
9640             else { /* Otherwise, replace its contents with the new list */
9641                 invlist_replace_list_destroys_src(*output, everything);
9642                 SvREFCNT_dec_NN(everything);
9643             }
9644
9645             return;
9646         }
9647
9648         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9649          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9650          * output will be empty */
9651
9652         if (a == NULL || _invlist_len(a) == 0) {
9653             if (*output == NULL) {
9654                 *output = _new_invlist(0);
9655             }
9656             else {
9657                 invlist_clear(*output);
9658             }
9659             return;
9660         }
9661
9662         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9663          * union.  We can just return a copy of 'a' if '*output' doesn't point
9664          * to an existing list */
9665         if (*output == NULL) {
9666             *output = invlist_clone(a, NULL);
9667             return;
9668         }
9669
9670         /* If the output is to overwrite 'a', we have a no-op, as it's
9671          * already in 'a' */
9672         if (*output == a) {
9673             return;
9674         }
9675
9676         /* Here, '*output' is to be overwritten by 'a' */
9677         u = invlist_clone(a, NULL);
9678         invlist_replace_list_destroys_src(*output, u);
9679         SvREFCNT_dec_NN(u);
9680
9681         return;
9682     }
9683
9684     /* Here 'b' is not empty.  See about 'a' */
9685
9686     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9687
9688         /* Here, 'a' is empty (and b is not).  That means the union will come
9689          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9690          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9691          * the clone */
9692
9693         SV ** dest = (*output == NULL) ? output : &u;
9694         *dest = invlist_clone(b, NULL);
9695         if (complement_b) {
9696             _invlist_invert(*dest);
9697         }
9698
9699         if (dest == &u) {
9700             invlist_replace_list_destroys_src(*output, u);
9701             SvREFCNT_dec_NN(u);
9702         }
9703
9704         return;
9705     }
9706
9707     /* Here both lists exist and are non-empty */
9708     array_a = invlist_array(a);
9709     array_b = invlist_array(b);
9710
9711     /* If are to take the union of 'a' with the complement of b, set it
9712      * up so are looking at b's complement. */
9713     if (complement_b) {
9714
9715         /* To complement, we invert: if the first element is 0, remove it.  To
9716          * do this, we just pretend the array starts one later */
9717         if (array_b[0] == 0) {
9718             array_b++;
9719             len_b--;
9720         }
9721         else {
9722
9723             /* But if the first element is not zero, we pretend the list starts
9724              * at the 0 that is always stored immediately before the array. */
9725             array_b--;
9726             len_b++;
9727         }
9728     }
9729
9730     /* Size the union for the worst case: that the sets are completely
9731      * disjoint */
9732     u = _new_invlist(len_a + len_b);
9733
9734     /* Will contain U+0000 if either component does */
9735     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9736                                       || (len_b > 0 && array_b[0] == 0));
9737
9738     /* Go through each input list item by item, stopping when have exhausted
9739      * one of them */
9740     while (i_a < len_a && i_b < len_b) {
9741         UV cp;      /* The element to potentially add to the union's array */
9742         bool cp_in_set;   /* is it in the input list's set or not */
9743
9744         /* We need to take one or the other of the two inputs for the union.
9745          * Since we are merging two sorted lists, we take the smaller of the
9746          * next items.  In case of a tie, we take first the one that is in its
9747          * set.  If we first took the one not in its set, it would decrement
9748          * the count, possibly to 0 which would cause it to be output as ending
9749          * the range, and the next time through we would take the same number,
9750          * and output it again as beginning the next range.  By doing it the
9751          * opposite way, there is no possibility that the count will be
9752          * momentarily decremented to 0, and thus the two adjoining ranges will
9753          * be seamlessly merged.  (In a tie and both are in the set or both not
9754          * in the set, it doesn't matter which we take first.) */
9755         if (       array_a[i_a] < array_b[i_b]
9756             || (   array_a[i_a] == array_b[i_b]
9757                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9758         {
9759             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9760             cp = array_a[i_a++];
9761         }
9762         else {
9763             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9764             cp = array_b[i_b++];
9765         }
9766
9767         /* Here, have chosen which of the two inputs to look at.  Only output
9768          * if the running count changes to/from 0, which marks the
9769          * beginning/end of a range that's in the set */
9770         if (cp_in_set) {
9771             if (count == 0) {
9772                 array_u[i_u++] = cp;
9773             }
9774             count++;
9775         }
9776         else {
9777             count--;
9778             if (count == 0) {
9779                 array_u[i_u++] = cp;
9780             }
9781         }
9782     }
9783
9784
9785     /* The loop above increments the index into exactly one of the input lists
9786      * each iteration, and ends when either index gets to its list end.  That
9787      * means the other index is lower than its end, and so something is
9788      * remaining in that one.  We decrement 'count', as explained below, if
9789      * that list is in its set.  (i_a and i_b each currently index the element
9790      * beyond the one we care about.) */
9791     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9792         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9793     {
9794         count--;
9795     }
9796
9797     /* Above we decremented 'count' if the list that had unexamined elements in
9798      * it was in its set.  This has made it so that 'count' being non-zero
9799      * means there isn't anything left to output; and 'count' equal to 0 means
9800      * that what is left to output is precisely that which is left in the
9801      * non-exhausted input list.
9802      *
9803      * To see why, note first that the exhausted input obviously has nothing
9804      * left to add to the union.  If it was in its set at its end, that means
9805      * the set extends from here to the platform's infinity, and hence so does
9806      * the union and the non-exhausted set is irrelevant.  The exhausted set
9807      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9808      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9809      * 'count' remains at 1.  This is consistent with the decremented 'count'
9810      * != 0 meaning there's nothing left to add to the union.
9811      *
9812      * But if the exhausted input wasn't in its set, it contributed 0 to
9813      * 'count', and the rest of the union will be whatever the other input is.
9814      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9815      * otherwise it gets decremented to 0.  This is consistent with 'count'
9816      * == 0 meaning the remainder of the union is whatever is left in the
9817      * non-exhausted list. */
9818     if (count != 0) {
9819         len_u = i_u;
9820     }
9821     else {
9822         IV copy_count = len_a - i_a;
9823         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9824             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9825         }
9826         else { /* The non-exhausted input is b */
9827             copy_count = len_b - i_b;
9828             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9829         }
9830         len_u = i_u + copy_count;
9831     }
9832
9833     /* Set the result to the final length, which can change the pointer to
9834      * array_u, so re-find it.  (Note that it is unlikely that this will
9835      * change, as we are shrinking the space, not enlarging it) */
9836     if (len_u != _invlist_len(u)) {
9837         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9838         invlist_trim(u);
9839         array_u = invlist_array(u);
9840     }
9841
9842     if (*output == NULL) {  /* Simply return the new inversion list */
9843         *output = u;
9844     }
9845     else {
9846         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9847          * could instead free '*output', and then set it to 'u', but experience
9848          * has shown [perl #127392] that if the input is a mortal, we can get a
9849          * huge build-up of these during regex compilation before they get
9850          * freed. */
9851         invlist_replace_list_destroys_src(*output, u);
9852         SvREFCNT_dec_NN(u);
9853     }
9854
9855     return;
9856 }
9857
9858 void
9859 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9860                                                const bool complement_b, SV** i)
9861 {
9862     /* Take the intersection of two inversion lists and point '*i' to it.  On
9863      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9864      * even 'a' or 'b').  If to an inversion list, the contents of the original
9865      * list will be replaced by the intersection.  The first list, 'a', may be
9866      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9867      * TRUE, the result will be the intersection of 'a' and the complement (or
9868      * inversion) of 'b' instead of 'b' directly.
9869      *
9870      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9871      * Richard Gillam, published by Addison-Wesley, and explained at some
9872      * length there.  The preface says to incorporate its examples into your
9873      * code at your own risk.  In fact, it had bugs
9874      *
9875      * The algorithm is like a merge sort, and is essentially the same as the
9876      * union above
9877      */
9878
9879     const UV* array_a;          /* a's array */
9880     const UV* array_b;
9881     UV len_a;   /* length of a's array */
9882     UV len_b;
9883
9884     SV* r;                   /* the resulting intersection */
9885     UV* array_r;
9886     UV len_r = 0;
9887
9888     UV i_a = 0;             /* current index into a's array */
9889     UV i_b = 0;
9890     UV i_r = 0;
9891
9892     /* running count of how many of the two inputs are postitioned at ranges
9893      * that are in their sets.  As explained in the algorithm source book,
9894      * items are stopped accumulating and are output when the count changes
9895      * to/from 2.  The count is incremented when we start a range that's in an
9896      * input's set, and decremented when we start a range that's not in a set.
9897      * Only when it is 2 are we in the intersection. */
9898     UV count = 0;
9899
9900     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9901     assert(a != b);
9902     assert(*i == NULL || is_invlist(*i));
9903
9904     /* Special case if either one is empty */
9905     len_a = (a == NULL) ? 0 : _invlist_len(a);
9906     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9907         if (len_a != 0 && complement_b) {
9908
9909             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9910              * must be empty.  Here, also we are using 'b's complement, which
9911              * hence must be every possible code point.  Thus the intersection
9912              * is simply 'a'. */
9913
9914             if (*i == a) {  /* No-op */
9915                 return;
9916             }
9917
9918             if (*i == NULL) {
9919                 *i = invlist_clone(a, NULL);
9920                 return;
9921             }
9922
9923             r = invlist_clone(a, NULL);
9924             invlist_replace_list_destroys_src(*i, r);
9925             SvREFCNT_dec_NN(r);
9926             return;
9927         }
9928
9929         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9930          * intersection must be empty */
9931         if (*i == NULL) {
9932             *i = _new_invlist(0);
9933             return;
9934         }
9935
9936         invlist_clear(*i);
9937         return;
9938     }
9939
9940     /* Here both lists exist and are non-empty */
9941     array_a = invlist_array(a);
9942     array_b = invlist_array(b);
9943
9944     /* If are to take the intersection of 'a' with the complement of b, set it
9945      * up so are looking at b's complement. */
9946     if (complement_b) {
9947
9948         /* To complement, we invert: if the first element is 0, remove it.  To
9949          * do this, we just pretend the array starts one later */
9950         if (array_b[0] == 0) {
9951             array_b++;
9952             len_b--;
9953         }
9954         else {
9955
9956             /* But if the first element is not zero, we pretend the list starts
9957              * at the 0 that is always stored immediately before the array. */
9958             array_b--;
9959             len_b++;
9960         }
9961     }
9962
9963     /* Size the intersection for the worst case: that the intersection ends up
9964      * fragmenting everything to be completely disjoint */
9965     r= _new_invlist(len_a + len_b);
9966
9967     /* Will contain U+0000 iff both components do */
9968     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9969                                      && len_b > 0 && array_b[0] == 0);
9970
9971     /* Go through each list item by item, stopping when have exhausted one of
9972      * them */
9973     while (i_a < len_a && i_b < len_b) {
9974         UV cp;      /* The element to potentially add to the intersection's
9975                        array */
9976         bool cp_in_set; /* Is it in the input list's set or not */
9977
9978         /* We need to take one or the other of the two inputs for the
9979          * intersection.  Since we are merging two sorted lists, we take the
9980          * smaller of the next items.  In case of a tie, we take first the one
9981          * that is not in its set (a difference from the union algorithm).  If
9982          * we first took the one in its set, it would increment the count,
9983          * possibly to 2 which would cause it to be output as starting a range
9984          * in the intersection, and the next time through we would take that
9985          * same number, and output it again as ending the set.  By doing the
9986          * opposite of this, there is no possibility that the count will be
9987          * momentarily incremented to 2.  (In a tie and both are in the set or
9988          * both not in the set, it doesn't matter which we take first.) */
9989         if (       array_a[i_a] < array_b[i_b]
9990             || (   array_a[i_a] == array_b[i_b]
9991                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9992         {
9993             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9994             cp = array_a[i_a++];
9995         }
9996         else {
9997             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9998             cp= array_b[i_b++];
9999         }
10000
10001         /* Here, have chosen which of the two inputs to look at.  Only output
10002          * if the running count changes to/from 2, which marks the
10003          * beginning/end of a range that's in the intersection */
10004         if (cp_in_set) {
10005             count++;
10006             if (count == 2) {
10007                 array_r[i_r++] = cp;
10008             }
10009         }
10010         else {
10011             if (count == 2) {
10012                 array_r[i_r++] = cp;
10013             }
10014             count--;
10015         }
10016
10017     }
10018
10019     /* The loop above increments the index into exactly one of the input lists
10020      * each iteration, and ends when either index gets to its list end.  That
10021      * means the other index is lower than its end, and so something is
10022      * remaining in that one.  We increment 'count', as explained below, if the
10023      * exhausted list was in its set.  (i_a and i_b each currently index the
10024      * element beyond the one we care about.) */
10025     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10026         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10027     {
10028         count++;
10029     }
10030
10031     /* Above we incremented 'count' if the exhausted list was in its set.  This
10032      * has made it so that 'count' being below 2 means there is nothing left to
10033      * output; otheriwse what's left to add to the intersection is precisely
10034      * that which is left in the non-exhausted input list.
10035      *
10036      * To see why, note first that the exhausted input obviously has nothing
10037      * left to affect the intersection.  If it was in its set at its end, that
10038      * means the set extends from here to the platform's infinity, and hence
10039      * anything in the non-exhausted's list will be in the intersection, and
10040      * anything not in it won't be.  Hence, the rest of the intersection is
10041      * precisely what's in the non-exhausted list  The exhausted set also
10042      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
10043      * it means 'count' is now at least 2.  This is consistent with the
10044      * incremented 'count' being >= 2 means to add the non-exhausted list to
10045      * the intersection.
10046      *
10047      * But if the exhausted input wasn't in its set, it contributed 0 to
10048      * 'count', and the intersection can't include anything further; the
10049      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
10050      * incremented.  This is consistent with 'count' being < 2 meaning nothing
10051      * further to add to the intersection. */
10052     if (count < 2) { /* Nothing left to put in the intersection. */
10053         len_r = i_r;
10054     }
10055     else { /* copy the non-exhausted list, unchanged. */
10056         IV copy_count = len_a - i_a;
10057         if (copy_count > 0) {   /* a is the one with stuff left */
10058             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10059         }
10060         else {  /* b is the one with stuff left */
10061             copy_count = len_b - i_b;
10062             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10063         }
10064         len_r = i_r + copy_count;
10065     }
10066
10067     /* Set the result to the final length, which can change the pointer to
10068      * array_r, so re-find it.  (Note that it is unlikely that this will
10069      * change, as we are shrinking the space, not enlarging it) */
10070     if (len_r != _invlist_len(r)) {
10071         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10072         invlist_trim(r);
10073         array_r = invlist_array(r);
10074     }
10075
10076     if (*i == NULL) { /* Simply return the calculated intersection */
10077         *i = r;
10078     }
10079     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
10080               instead free '*i', and then set it to 'r', but experience has
10081               shown [perl #127392] that if the input is a mortal, we can get a
10082               huge build-up of these during regex compilation before they get
10083               freed. */
10084         if (len_r) {
10085             invlist_replace_list_destroys_src(*i, r);
10086         }
10087         else {
10088             invlist_clear(*i);
10089         }
10090         SvREFCNT_dec_NN(r);
10091     }
10092
10093     return;
10094 }
10095
10096 SV*
10097 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10098 {
10099     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10100      * set.  A pointer to the inversion list is returned.  This may actually be
10101      * a new list, in which case the passed in one has been destroyed.  The
10102      * passed-in inversion list can be NULL, in which case a new one is created
10103      * with just the one range in it.  The new list is not necessarily
10104      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10105      * result of this function.  The gain would not be large, and in many
10106      * cases, this is called multiple times on a single inversion list, so
10107      * anything freed may almost immediately be needed again.
10108      *
10109      * This used to mostly call the 'union' routine, but that is much more
10110      * heavyweight than really needed for a single range addition */
10111
10112     UV* array;              /* The array implementing the inversion list */
10113     UV len;                 /* How many elements in 'array' */
10114     SSize_t i_s;            /* index into the invlist array where 'start'
10115                                should go */
10116     SSize_t i_e = 0;        /* And the index where 'end' should go */
10117     UV cur_highest;         /* The highest code point in the inversion list
10118                                upon entry to this function */
10119
10120     /* This range becomes the whole inversion list if none already existed */
10121     if (invlist == NULL) {
10122         invlist = _new_invlist(2);
10123         _append_range_to_invlist(invlist, start, end);
10124         return invlist;
10125     }
10126
10127     /* Likewise, if the inversion list is currently empty */
10128     len = _invlist_len(invlist);
10129     if (len == 0) {
10130         _append_range_to_invlist(invlist, start, end);
10131         return invlist;
10132     }
10133
10134     /* Starting here, we have to know the internals of the list */
10135     array = invlist_array(invlist);
10136
10137     /* If the new range ends higher than the current highest ... */
10138     cur_highest = invlist_highest(invlist);
10139     if (end > cur_highest) {
10140
10141         /* If the whole range is higher, we can just append it */
10142         if (start > cur_highest) {
10143             _append_range_to_invlist(invlist, start, end);
10144             return invlist;
10145         }
10146
10147         /* Otherwise, add the portion that is higher ... */
10148         _append_range_to_invlist(invlist, cur_highest + 1, end);
10149
10150         /* ... and continue on below to handle the rest.  As a result of the
10151          * above append, we know that the index of the end of the range is the
10152          * final even numbered one of the array.  Recall that the final element
10153          * always starts a range that extends to infinity.  If that range is in
10154          * the set (meaning the set goes from here to infinity), it will be an
10155          * even index, but if it isn't in the set, it's odd, and the final
10156          * range in the set is one less, which is even. */
10157         if (end == UV_MAX) {
10158             i_e = len;
10159         }
10160         else {
10161             i_e = len - 2;
10162         }
10163     }
10164
10165     /* We have dealt with appending, now see about prepending.  If the new
10166      * range starts lower than the current lowest ... */
10167     if (start < array[0]) {
10168
10169         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10170          * Let the union code handle it, rather than having to know the
10171          * trickiness in two code places.  */
10172         if (UNLIKELY(start == 0)) {
10173             SV* range_invlist;
10174
10175             range_invlist = _new_invlist(2);
10176             _append_range_to_invlist(range_invlist, start, end);
10177
10178             _invlist_union(invlist, range_invlist, &invlist);
10179
10180             SvREFCNT_dec_NN(range_invlist);
10181
10182             return invlist;
10183         }
10184
10185         /* If the whole new range comes before the first entry, and doesn't
10186          * extend it, we have to insert it as an additional range */
10187         if (end < array[0] - 1) {
10188             i_s = i_e = -1;
10189             goto splice_in_new_range;
10190         }
10191
10192         /* Here the new range adjoins the existing first range, extending it
10193          * downwards. */
10194         array[0] = start;
10195
10196         /* And continue on below to handle the rest.  We know that the index of
10197          * the beginning of the range is the first one of the array */
10198         i_s = 0;
10199     }
10200     else { /* Not prepending any part of the new range to the existing list.
10201             * Find where in the list it should go.  This finds i_s, such that:
10202             *     invlist[i_s] <= start < array[i_s+1]
10203             */
10204         i_s = _invlist_search(invlist, start);
10205     }
10206
10207     /* At this point, any extending before the beginning of the inversion list
10208      * and/or after the end has been done.  This has made it so that, in the
10209      * code below, each endpoint of the new range is either in a range that is
10210      * in the set, or is in a gap between two ranges that are.  This means we
10211      * don't have to worry about exceeding the array bounds.
10212      *
10213      * Find where in the list the new range ends (but we can skip this if we
10214      * have already determined what it is, or if it will be the same as i_s,
10215      * which we already have computed) */
10216     if (i_e == 0) {
10217         i_e = (start == end)
10218               ? i_s
10219               : _invlist_search(invlist, end);
10220     }
10221
10222     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10223      * is a range that goes to infinity there is no element at invlist[i_e+1],
10224      * so only the first relation holds. */
10225
10226     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10227
10228         /* Here, the ranges on either side of the beginning of the new range
10229          * are in the set, and this range starts in the gap between them.
10230          *
10231          * The new range extends the range above it downwards if the new range
10232          * ends at or above that range's start */
10233         const bool extends_the_range_above = (   end == UV_MAX
10234                                               || end + 1 >= array[i_s+1]);
10235
10236         /* The new range extends the range below it upwards if it begins just
10237          * after where that range ends */
10238         if (start == array[i_s]) {
10239
10240             /* If the new range fills the entire gap between the other ranges,
10241              * they will get merged together.  Other ranges may also get
10242              * merged, depending on how many of them the new range spans.  In
10243              * the general case, we do the merge later, just once, after we
10244              * figure out how many to merge.  But in the case where the new
10245              * range exactly spans just this one gap (possibly extending into
10246              * the one above), we do the merge here, and an early exit.  This
10247              * is done here to avoid having to special case later. */
10248             if (i_e - i_s <= 1) {
10249
10250                 /* If i_e - i_s == 1, it means that the new range terminates
10251                  * within the range above, and hence 'extends_the_range_above'
10252                  * must be true.  (If the range above it extends to infinity,
10253                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10254                  * will be 0, so no harm done.) */
10255                 if (extends_the_range_above) {
10256                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10257                     invlist_set_len(invlist,
10258                                     len - 2,
10259                                     *(get_invlist_offset_addr(invlist)));
10260                     return invlist;
10261                 }
10262
10263                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10264                  * to the same range, and below we are about to decrement i_s
10265                  * */
10266                 i_e--;
10267             }
10268
10269             /* Here, the new range is adjacent to the one below.  (It may also
10270              * span beyond the range above, but that will get resolved later.)
10271              * Extend the range below to include this one. */
10272             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10273             i_s--;
10274             start = array[i_s];
10275         }
10276         else if (extends_the_range_above) {
10277
10278             /* Here the new range only extends the range above it, but not the
10279              * one below.  It merges with the one above.  Again, we keep i_e
10280              * and i_s in sync if they point to the same range */
10281             if (i_e == i_s) {
10282                 i_e++;
10283             }
10284             i_s++;
10285             array[i_s] = start;
10286         }
10287     }
10288
10289     /* Here, we've dealt with the new range start extending any adjoining
10290      * existing ranges.
10291      *
10292      * If the new range extends to infinity, it is now the final one,
10293      * regardless of what was there before */
10294     if (UNLIKELY(end == UV_MAX)) {
10295         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10296         return invlist;
10297     }
10298
10299     /* If i_e started as == i_s, it has also been dealt with,
10300      * and been updated to the new i_s, which will fail the following if */
10301     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10302
10303         /* Here, the ranges on either side of the end of the new range are in
10304          * the set, and this range ends in the gap between them.
10305          *
10306          * If this range is adjacent to (hence extends) the range above it, it
10307          * becomes part of that range; likewise if it extends the range below,
10308          * it becomes part of that range */
10309         if (end + 1 == array[i_e+1]) {
10310             i_e++;
10311             array[i_e] = start;
10312         }
10313         else if (start <= array[i_e]) {
10314             array[i_e] = end + 1;
10315             i_e--;
10316         }
10317     }
10318
10319     if (i_s == i_e) {
10320
10321         /* If the range fits entirely in an existing range (as possibly already
10322          * extended above), it doesn't add anything new */
10323         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10324             return invlist;
10325         }
10326
10327         /* Here, no part of the range is in the list.  Must add it.  It will
10328          * occupy 2 more slots */
10329       splice_in_new_range:
10330
10331         invlist_extend(invlist, len + 2);
10332         array = invlist_array(invlist);
10333         /* Move the rest of the array down two slots. Don't include any
10334          * trailing NUL */
10335         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10336
10337         /* Do the actual splice */
10338         array[i_e+1] = start;
10339         array[i_e+2] = end + 1;
10340         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10341         return invlist;
10342     }
10343
10344     /* Here the new range crossed the boundaries of a pre-existing range.  The
10345      * code above has adjusted things so that both ends are in ranges that are
10346      * in the set.  This means everything in between must also be in the set.
10347      * Just squash things together */
10348     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10349     invlist_set_len(invlist,
10350                     len - i_e + i_s,
10351                     *(get_invlist_offset_addr(invlist)));
10352
10353     return invlist;
10354 }
10355
10356 SV*
10357 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10358                                  UV** other_elements_ptr)
10359 {
10360     /* Create and return an inversion list whose contents are to be populated
10361      * by the caller.  The caller gives the number of elements (in 'size') and
10362      * the very first element ('element0').  This function will set
10363      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10364      * are to be placed.
10365      *
10366      * Obviously there is some trust involved that the caller will properly
10367      * fill in the other elements of the array.
10368      *
10369      * (The first element needs to be passed in, as the underlying code does
10370      * things differently depending on whether it is zero or non-zero) */
10371
10372     SV* invlist = _new_invlist(size);
10373     bool offset;
10374
10375     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10376
10377     invlist = add_cp_to_invlist(invlist, element0);
10378     offset = *get_invlist_offset_addr(invlist);
10379
10380     invlist_set_len(invlist, size, offset);
10381     *other_elements_ptr = invlist_array(invlist) + 1;
10382     return invlist;
10383 }
10384
10385 #endif
10386
10387 #ifndef PERL_IN_XSUB_RE
10388 void
10389 Perl__invlist_invert(pTHX_ SV* const invlist)
10390 {
10391     /* Complement the input inversion list.  This adds a 0 if the list didn't
10392      * have a zero; removes it otherwise.  As described above, the data
10393      * structure is set up so that this is very efficient */
10394
10395     PERL_ARGS_ASSERT__INVLIST_INVERT;
10396
10397     assert(! invlist_is_iterating(invlist));
10398
10399     /* The inverse of matching nothing is matching everything */
10400     if (_invlist_len(invlist) == 0) {
10401         _append_range_to_invlist(invlist, 0, UV_MAX);
10402         return;
10403     }
10404
10405     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10406 }
10407
10408 SV*
10409 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10410 {
10411     /* Return a new inversion list that is a copy of the input one, which is
10412      * unchanged.  The new list will not be mortal even if the old one was. */
10413
10414     const STRLEN nominal_length = _invlist_len(invlist);
10415     const STRLEN physical_length = SvCUR(invlist);
10416     const bool offset = *(get_invlist_offset_addr(invlist));
10417
10418     PERL_ARGS_ASSERT_INVLIST_CLONE;
10419
10420     if (new_invlist == NULL) {
10421         new_invlist = _new_invlist(nominal_length);
10422     }
10423     else {
10424         sv_upgrade(new_invlist, SVt_INVLIST);
10425         initialize_invlist_guts(new_invlist, nominal_length);
10426     }
10427
10428     *(get_invlist_offset_addr(new_invlist)) = offset;
10429     invlist_set_len(new_invlist, nominal_length, offset);
10430     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10431
10432     return new_invlist;
10433 }
10434
10435 #endif
10436
10437 PERL_STATIC_INLINE UV
10438 S_invlist_lowest(SV* const invlist)
10439 {
10440     /* Returns the lowest code point that matches an inversion list.  This API
10441      * has an ambiguity, as it returns 0 under either the lowest is actually
10442      * 0, or if the list is empty.  If this distinction matters to you, check
10443      * for emptiness before calling this function */
10444
10445     UV len = _invlist_len(invlist);
10446     UV *array;
10447
10448     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10449
10450     if (len == 0) {
10451         return 0;
10452     }
10453
10454     array = invlist_array(invlist);
10455
10456     return array[0];
10457 }
10458
10459 STATIC SV *
10460 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10461 {
10462     /* Get the contents of an inversion list into a string SV so that they can
10463      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10464      * traditionally done for debug tracing; otherwise it uses a format
10465      * suitable for just copying to the output, with blanks between ranges and
10466      * a dash between range components */
10467
10468     UV start, end;
10469     SV* output;
10470     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10471     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10472
10473     if (traditional_style) {
10474         output = newSVpvs("\n");
10475     }
10476     else {
10477         output = newSVpvs("");
10478     }
10479
10480     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10481
10482     assert(! invlist_is_iterating(invlist));
10483
10484     invlist_iterinit(invlist);
10485     while (invlist_iternext(invlist, &start, &end)) {
10486         if (end == UV_MAX) {
10487             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10488                                           start, intra_range_delimiter,
10489                                                  inter_range_delimiter);
10490         }
10491         else if (end != start) {
10492             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10493                                           start,
10494                                                    intra_range_delimiter,
10495                                                   end, inter_range_delimiter);
10496         }
10497         else {
10498             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10499                                           start, inter_range_delimiter);
10500         }
10501     }
10502
10503     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10504         SvCUR_set(output, SvCUR(output) - 1);
10505     }
10506
10507     return output;
10508 }
10509
10510 #ifndef PERL_IN_XSUB_RE
10511 void
10512 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10513                          const char * const indent, SV* const invlist)
10514 {
10515     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10516      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10517      * the string 'indent'.  The output looks like this:
10518          [0] 0x000A .. 0x000D
10519          [2] 0x0085
10520          [4] 0x2028 .. 0x2029
10521          [6] 0x3104 .. INFTY
10522      * This means that the first range of code points matched by the list are
10523      * 0xA through 0xD; the second range contains only the single code point
10524      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10525      * are used to define each range (except if the final range extends to
10526      * infinity, only a single element is needed).  The array index of the
10527      * first element for the corresponding range is given in brackets. */
10528
10529     UV start, end;
10530     STRLEN count = 0;
10531
10532     PERL_ARGS_ASSERT__INVLIST_DUMP;
10533
10534     if (invlist_is_iterating(invlist)) {
10535         Perl_dump_indent(aTHX_ level, file,
10536              "%sCan't dump inversion list because is in middle of iterating\n",
10537              indent);
10538         return;
10539     }
10540
10541     invlist_iterinit(invlist);
10542     while (invlist_iternext(invlist, &start, &end)) {
10543         if (end == UV_MAX) {
10544             Perl_dump_indent(aTHX_ level, file,
10545                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10546                                    indent, (UV)count, start);
10547         }
10548         else if (end != start) {
10549             Perl_dump_indent(aTHX_ level, file,
10550                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10551                                 indent, (UV)count, start,         end);
10552         }
10553         else {
10554             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10555                                             indent, (UV)count, start);
10556         }
10557         count += 2;
10558     }
10559 }
10560
10561 #endif
10562
10563 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10564 bool
10565 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10566 {
10567     /* Return a boolean as to if the two passed in inversion lists are
10568      * identical.  The final argument, if TRUE, says to take the complement of
10569      * the second inversion list before doing the comparison */
10570
10571     const UV len_a = _invlist_len(a);
10572     UV len_b = _invlist_len(b);
10573
10574     const UV* array_a = NULL;
10575     const UV* array_b = NULL;
10576
10577     PERL_ARGS_ASSERT__INVLISTEQ;
10578
10579     /* This code avoids accessing the arrays unless it knows the length is
10580      * non-zero */
10581
10582     if (len_a == 0) {
10583         if (len_b == 0) {
10584             return ! complement_b;
10585         }
10586     }
10587     else {
10588         array_a = invlist_array(a);
10589     }
10590
10591     if (len_b != 0) {
10592         array_b = invlist_array(b);
10593     }
10594
10595     /* If are to compare 'a' with the complement of b, set it
10596      * up so are looking at b's complement. */
10597     if (complement_b) {
10598
10599         /* The complement of nothing is everything, so <a> would have to have
10600          * just one element, starting at zero (ending at infinity) */
10601         if (len_b == 0) {
10602             return (len_a == 1 && array_a[0] == 0);
10603         }
10604         if (array_b[0] == 0) {
10605
10606             /* Otherwise, to complement, we invert.  Here, the first element is
10607              * 0, just remove it.  To do this, we just pretend the array starts
10608              * one later */
10609
10610             array_b++;
10611             len_b--;
10612         }
10613         else {
10614
10615             /* But if the first element is not zero, we pretend the list starts
10616              * at the 0 that is always stored immediately before the array. */
10617             array_b--;
10618             len_b++;
10619         }
10620     }
10621
10622     return    len_a == len_b
10623            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10624
10625 }
10626 #endif
10627
10628 /*
10629  * As best we can, determine the characters that can match the start of
10630  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10631  * can be false positive matches
10632  *
10633  * Returns the invlist as a new SV*; it is the caller's responsibility to
10634  * call SvREFCNT_dec() when done with it.
10635  */
10636 STATIC SV*
10637 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10638 {
10639     const U8 * s = (U8*)STRING(node);
10640     SSize_t bytelen = STR_LEN(node);
10641     UV uc;
10642     /* Start out big enough for 2 separate code points */
10643     SV* invlist = _new_invlist(4);
10644
10645     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10646
10647     if (! UTF) {
10648         uc = *s;
10649
10650         /* We punt and assume can match anything if the node begins
10651          * with a multi-character fold.  Things are complicated.  For
10652          * example, /ffi/i could match any of:
10653          *  "\N{LATIN SMALL LIGATURE FFI}"
10654          *  "\N{LATIN SMALL LIGATURE FF}I"
10655          *  "F\N{LATIN SMALL LIGATURE FI}"
10656          *  plus several other things; and making sure we have all the
10657          *  possibilities is hard. */
10658         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10659             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10660         }
10661         else {
10662             /* Any Latin1 range character can potentially match any
10663              * other depending on the locale, and in Turkic locales, U+130 and
10664              * U+131 */
10665             if (OP(node) == EXACTFL) {
10666                 _invlist_union(invlist, PL_Latin1, &invlist);
10667                 invlist = add_cp_to_invlist(invlist,
10668                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10669                 invlist = add_cp_to_invlist(invlist,
10670                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10671             }
10672             else {
10673                 /* But otherwise, it matches at least itself.  We can
10674                  * quickly tell if it has a distinct fold, and if so,
10675                  * it matches that as well */
10676                 invlist = add_cp_to_invlist(invlist, uc);
10677                 if (IS_IN_SOME_FOLD_L1(uc))
10678                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10679             }
10680
10681             /* Some characters match above-Latin1 ones under /i.  This
10682              * is true of EXACTFL ones when the locale is UTF-8 */
10683             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10684                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10685                                     && OP(node) != EXACTFAA_NO_TRIE)))
10686             {
10687                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10688             }
10689         }
10690     }
10691     else {  /* Pattern is UTF-8 */
10692         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10693         const U8* e = s + bytelen;
10694         IV fc;
10695
10696         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10697
10698         /* The only code points that aren't folded in a UTF EXACTFish
10699          * node are the problematic ones in EXACTFL nodes */
10700         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10701             /* We need to check for the possibility that this EXACTFL
10702              * node begins with a multi-char fold.  Therefore we fold
10703              * the first few characters of it so that we can make that
10704              * check */
10705             U8 *d = folded;
10706             int i;
10707
10708             fc = -1;
10709             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10710                 if (isASCII(*s)) {
10711                     *(d++) = (U8) toFOLD(*s);
10712                     if (fc < 0) {       /* Save the first fold */
10713                         fc = *(d-1);
10714                     }
10715                     s++;
10716                 }
10717                 else {
10718                     STRLEN len;
10719                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10720                     if (fc < 0) {       /* Save the first fold */
10721                         fc = fold;
10722                     }
10723                     d += len;
10724                     s += UTF8SKIP(s);
10725                 }
10726             }
10727
10728             /* And set up so the code below that looks in this folded
10729              * buffer instead of the node's string */
10730             e = d;
10731             s = folded;
10732         }
10733
10734         /* When we reach here 's' points to the fold of the first
10735          * character(s) of the node; and 'e' points to far enough along
10736          * the folded string to be just past any possible multi-char
10737          * fold.
10738          *
10739          * Unlike the non-UTF-8 case, the macro for determining if a
10740          * string is a multi-char fold requires all the characters to
10741          * already be folded.  This is because of all the complications
10742          * if not.  Note that they are folded anyway, except in EXACTFL
10743          * nodes.  Like the non-UTF case above, we punt if the node
10744          * begins with a multi-char fold  */
10745
10746         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10747             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10748         }
10749         else {  /* Single char fold */
10750             unsigned int k;
10751             U32 first_fold;
10752             const U32 * remaining_folds;
10753             Size_t folds_count;
10754
10755             /* It matches itself */
10756             invlist = add_cp_to_invlist(invlist, fc);
10757
10758             /* ... plus all the things that fold to it, which are found in
10759              * PL_utf8_foldclosures */
10760             folds_count = _inverse_folds(fc, &first_fold,
10761                                                 &remaining_folds);
10762             for (k = 0; k < folds_count; k++) {
10763                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10764
10765                 /* /aa doesn't allow folds between ASCII and non- */
10766                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10767                     && isASCII(c) != isASCII(fc))
10768                 {
10769                     continue;
10770                 }
10771
10772                 invlist = add_cp_to_invlist(invlist, c);
10773             }
10774
10775             if (OP(node) == EXACTFL) {
10776
10777                 /* If either [iI] are present in an EXACTFL node the above code
10778                  * should have added its normal case pair, but under a Turkish
10779                  * locale they could match instead the case pairs from it.  Add
10780                  * those as potential matches as well */
10781                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10782                     invlist = add_cp_to_invlist(invlist,
10783                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10784                     invlist = add_cp_to_invlist(invlist,
10785                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10786                 }
10787                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10788                     invlist = add_cp_to_invlist(invlist, 'I');
10789                 }
10790                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10791                     invlist = add_cp_to_invlist(invlist, 'i');
10792                 }
10793             }
10794         }
10795     }
10796
10797     return invlist;
10798 }
10799
10800 #undef HEADER_LENGTH
10801 #undef TO_INTERNAL_SIZE
10802 #undef FROM_INTERNAL_SIZE
10803 #undef INVLIST_VERSION_ID
10804
10805 /* End of inversion list object */
10806
10807 STATIC void
10808 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10809 {
10810     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10811      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10812      * should point to the first flag; it is updated on output to point to the
10813      * final ')' or ':'.  There needs to be at least one flag, or this will
10814      * abort */
10815
10816     /* for (?g), (?gc), and (?o) warnings; warning
10817        about (?c) will warn about (?g) -- japhy    */
10818
10819 #define WASTED_O  0x01
10820 #define WASTED_G  0x02
10821 #define WASTED_C  0x04
10822 #define WASTED_GC (WASTED_G|WASTED_C)
10823     I32 wastedflags = 0x00;
10824     U32 posflags = 0, negflags = 0;
10825     U32 *flagsp = &posflags;
10826     char has_charset_modifier = '\0';
10827     regex_charset cs;
10828     bool has_use_defaults = FALSE;
10829     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10830     int x_mod_count = 0;
10831
10832     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10833
10834     /* '^' as an initial flag sets certain defaults */
10835     if (UCHARAT(RExC_parse) == '^') {
10836         RExC_parse++;
10837         has_use_defaults = TRUE;
10838         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10839         cs = (RExC_uni_semantics)
10840              ? REGEX_UNICODE_CHARSET
10841              : REGEX_DEPENDS_CHARSET;
10842         set_regex_charset(&RExC_flags, cs);
10843     }
10844     else {
10845         cs = get_regex_charset(RExC_flags);
10846         if (   cs == REGEX_DEPENDS_CHARSET
10847             && RExC_uni_semantics)
10848         {
10849             cs = REGEX_UNICODE_CHARSET;
10850         }
10851     }
10852
10853     while (RExC_parse < RExC_end) {
10854         /* && memCHRs("iogcmsx", *RExC_parse) */
10855         /* (?g), (?gc) and (?o) are useless here
10856            and must be globally applied -- japhy */
10857         if ((RExC_pm_flags & PMf_WILDCARD)) {
10858             if (flagsp == & negflags) {
10859                 if (*RExC_parse == 'm') {
10860                     RExC_parse++;
10861                     /* diag_listed_as: Use of %s is not allowed in Unicode
10862                        property wildcard subpatterns in regex; marked by <--
10863                        HERE in m/%s/ */
10864                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
10865                           " property wildcard subpatterns");
10866                 }
10867             }
10868             else {
10869                 if (*RExC_parse == 's') {
10870                     goto modifier_illegal_in_wildcard;
10871                 }
10872             }
10873         }
10874
10875         switch (*RExC_parse) {
10876
10877             /* Code for the imsxn flags */
10878             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10879
10880             case LOCALE_PAT_MOD:
10881                 if (has_charset_modifier) {
10882                     goto excess_modifier;
10883                 }
10884                 else if (flagsp == &negflags) {
10885                     goto neg_modifier;
10886                 }
10887                 cs = REGEX_LOCALE_CHARSET;
10888                 has_charset_modifier = LOCALE_PAT_MOD;
10889                 break;
10890             case UNICODE_PAT_MOD:
10891                 if (has_charset_modifier) {
10892                     goto excess_modifier;
10893                 }
10894                 else if (flagsp == &negflags) {
10895                     goto neg_modifier;
10896                 }
10897                 cs = REGEX_UNICODE_CHARSET;
10898                 has_charset_modifier = UNICODE_PAT_MOD;
10899                 break;
10900             case ASCII_RESTRICT_PAT_MOD:
10901                 if (flagsp == &negflags) {
10902                     goto neg_modifier;
10903                 }
10904                 if (has_charset_modifier) {
10905                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10906                         goto excess_modifier;
10907                     }
10908                     /* Doubled modifier implies more restricted */
10909                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10910                 }
10911                 else {
10912                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10913                 }
10914                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10915                 break;
10916             case DEPENDS_PAT_MOD:
10917                 if (has_use_defaults) {
10918                     goto fail_modifiers;
10919                 }
10920                 else if (flagsp == &negflags) {
10921                     goto neg_modifier;
10922                 }
10923                 else if (has_charset_modifier) {
10924                     goto excess_modifier;
10925                 }
10926
10927                 /* The dual charset means unicode semantics if the
10928                  * pattern (or target, not known until runtime) are
10929                  * utf8, or something in the pattern indicates unicode
10930                  * semantics */
10931                 cs = (RExC_uni_semantics)
10932                      ? REGEX_UNICODE_CHARSET
10933                      : REGEX_DEPENDS_CHARSET;
10934                 has_charset_modifier = DEPENDS_PAT_MOD;
10935                 break;
10936               excess_modifier:
10937                 RExC_parse++;
10938                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10939                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10940                 }
10941                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10942                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10943                                         *(RExC_parse - 1));
10944                 }
10945                 else {
10946                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10947                 }
10948                 NOT_REACHED; /*NOTREACHED*/
10949               neg_modifier:
10950                 RExC_parse++;
10951                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10952                                     *(RExC_parse - 1));
10953                 NOT_REACHED; /*NOTREACHED*/
10954             case GLOBAL_PAT_MOD: /* 'g' */
10955                 if (RExC_pm_flags & PMf_WILDCARD) {
10956                     goto modifier_illegal_in_wildcard;
10957                 }
10958                 /*FALLTHROUGH*/
10959             case ONCE_PAT_MOD: /* 'o' */
10960                 if (ckWARN(WARN_REGEXP)) {
10961                     const I32 wflagbit = *RExC_parse == 'o'
10962                                          ? WASTED_O
10963                                          : WASTED_G;
10964                     if (! (wastedflags & wflagbit) ) {
10965                         wastedflags |= wflagbit;
10966                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10967                         vWARN5(
10968                             RExC_parse + 1,
10969                             "Useless (%s%c) - %suse /%c modifier",
10970                             flagsp == &negflags ? "?-" : "?",
10971                             *RExC_parse,
10972                             flagsp == &negflags ? "don't " : "",
10973                             *RExC_parse
10974                         );
10975                     }
10976                 }
10977                 break;
10978
10979             case CONTINUE_PAT_MOD: /* 'c' */
10980                 if (RExC_pm_flags & PMf_WILDCARD) {
10981                     goto modifier_illegal_in_wildcard;
10982                 }
10983                 if (ckWARN(WARN_REGEXP)) {
10984                     if (! (wastedflags & WASTED_C) ) {
10985                         wastedflags |= WASTED_GC;
10986                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10987                         vWARN3(
10988                             RExC_parse + 1,
10989                             "Useless (%sc) - %suse /gc modifier",
10990                             flagsp == &negflags ? "?-" : "?",
10991                             flagsp == &negflags ? "don't " : ""
10992                         );
10993                     }
10994                 }
10995                 break;
10996             case KEEPCOPY_PAT_MOD: /* 'p' */
10997                 if (RExC_pm_flags & PMf_WILDCARD) {
10998                     goto modifier_illegal_in_wildcard;
10999                 }
11000                 if (flagsp == &negflags) {
11001                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11002                 } else {
11003                     *flagsp |= RXf_PMf_KEEPCOPY;
11004                 }
11005                 break;
11006             case '-':
11007                 /* A flag is a default iff it is following a minus, so
11008                  * if there is a minus, it means will be trying to
11009                  * re-specify a default which is an error */
11010                 if (has_use_defaults || flagsp == &negflags) {
11011                     goto fail_modifiers;
11012                 }
11013                 flagsp = &negflags;
11014                 wastedflags = 0;  /* reset so (?g-c) warns twice */
11015                 x_mod_count = 0;
11016                 break;
11017             case ':':
11018             case ')':
11019
11020                 if (  (RExC_pm_flags & PMf_WILDCARD)
11021                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11022                 {
11023                     RExC_parse++;
11024                     /* diag_listed_as: Use of %s is not allowed in Unicode
11025                        property wildcard subpatterns in regex; marked by <--
11026                        HERE in m/%s/ */
11027                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11028                            " property wildcard subpatterns",
11029                            has_charset_modifier);
11030                 }
11031
11032                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11033                     negflags |= RXf_PMf_EXTENDED_MORE;
11034                 }
11035                 RExC_flags |= posflags;
11036
11037                 if (negflags & RXf_PMf_EXTENDED) {
11038                     negflags |= RXf_PMf_EXTENDED_MORE;
11039                 }
11040                 RExC_flags &= ~negflags;
11041                 set_regex_charset(&RExC_flags, cs);
11042
11043                 return;
11044             default:
11045               fail_modifiers:
11046                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11047                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11048                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11049                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11050                 NOT_REACHED; /*NOTREACHED*/
11051         }
11052
11053         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11054     }
11055
11056     vFAIL("Sequence (?... not terminated");
11057
11058   modifier_illegal_in_wildcard:
11059     RExC_parse++;
11060     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11061        subpatterns in regex; marked by <-- HERE in m/%s/ */
11062     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11063            " subpatterns", *(RExC_parse - 1));
11064 }
11065
11066 /*
11067  - reg - regular expression, i.e. main body or parenthesized thing
11068  *
11069  * Caller must absorb opening parenthesis.
11070  *
11071  * Combining parenthesis handling with the base level of regular expression
11072  * is a trifle forced, but the need to tie the tails of the branches to what
11073  * follows makes it hard to avoid.
11074  */
11075 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11076 #ifdef DEBUGGING
11077 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11078 #else
11079 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11080 #endif
11081
11082 STATIC regnode_offset
11083 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11084                              I32 *flagp,
11085                              char * parse_start,
11086                              char ch
11087                       )
11088 {
11089     regnode_offset ret;
11090     char* name_start = RExC_parse;
11091     U32 num = 0;
11092     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11093     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11094
11095     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11096
11097     if (RExC_parse == name_start || *RExC_parse != ch) {
11098         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11099         vFAIL2("Sequence %.3s... not terminated", parse_start);
11100     }
11101
11102     if (sv_dat) {
11103         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11104         RExC_rxi->data->data[num]=(void*)sv_dat;
11105         SvREFCNT_inc_simple_void_NN(sv_dat);
11106     }
11107     RExC_sawback = 1;
11108     ret = reganode(pRExC_state,
11109                    ((! FOLD)
11110                      ? REFN
11111                      : (ASCII_FOLD_RESTRICTED)
11112                        ? REFFAN
11113                        : (AT_LEAST_UNI_SEMANTICS)
11114                          ? REFFUN
11115                          : (LOC)
11116                            ? REFFLN
11117                            : REFFN),
11118                     num);
11119     *flagp |= HASWIDTH;
11120
11121     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11122     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11123
11124     nextchar(pRExC_state);
11125     return ret;
11126 }
11127
11128 /* On success, returns the offset at which any next node should be placed into
11129  * the regex engine program being compiled.
11130  *
11131  * Returns 0 otherwise, with *flagp set to indicate why:
11132  *  TRYAGAIN        at the end of (?) that only sets flags.
11133  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11134  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11135  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11136  *  happen.  */
11137 STATIC regnode_offset
11138 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11139     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11140      * 2 is like 1, but indicates that nextchar() has been called to advance
11141      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11142      * this flag alerts us to the need to check for that */
11143 {
11144     regnode_offset ret = 0;    /* Will be the head of the group. */
11145     regnode_offset br;
11146     regnode_offset lastbr;
11147     regnode_offset ender = 0;
11148     I32 parno = 0;
11149     I32 flags;
11150     U32 oregflags = RExC_flags;
11151     bool have_branch = 0;
11152     bool is_open = 0;
11153     I32 freeze_paren = 0;
11154     I32 after_freeze = 0;
11155     I32 num; /* numeric backreferences */
11156     SV * max_open;  /* Max number of unclosed parens */
11157
11158     char * parse_start = RExC_parse; /* MJD */
11159     char * const oregcomp_parse = RExC_parse;
11160
11161     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11162
11163     PERL_ARGS_ASSERT_REG;
11164     DEBUG_PARSE("reg ");
11165
11166     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11167     assert(max_open);
11168     if (!SvIOK(max_open)) {
11169         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11170     }
11171     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11172                                               open paren */
11173         vFAIL("Too many nested open parens");
11174     }
11175
11176     *flagp = 0;                         /* Initialize. */
11177
11178     if (RExC_in_lookbehind) {
11179         RExC_in_lookbehind++;
11180     }
11181     if (RExC_in_lookahead) {
11182         RExC_in_lookahead++;
11183     }
11184
11185     /* Having this true makes it feasible to have a lot fewer tests for the
11186      * parse pointer being in scope.  For example, we can write
11187      *      while(isFOO(*RExC_parse)) RExC_parse++;
11188      * instead of
11189      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11190      */
11191     assert(*RExC_end == '\0');
11192
11193     /* Make an OPEN node, if parenthesized. */
11194     if (paren) {
11195
11196         /* Under /x, space and comments can be gobbled up between the '(' and
11197          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11198          * intervening space, as the sequence is a token, and a token should be
11199          * indivisible */
11200         bool has_intervening_patws = (paren == 2)
11201                                   && *(RExC_parse - 1) != '(';
11202
11203         if (RExC_parse >= RExC_end) {
11204             vFAIL("Unmatched (");
11205         }
11206
11207         if (paren == 'r') {     /* Atomic script run */
11208             paren = '>';
11209             goto parse_rest;
11210         }
11211         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11212             char *start_verb = RExC_parse + 1;
11213             STRLEN verb_len;
11214             char *start_arg = NULL;
11215             unsigned char op = 0;
11216             int arg_required = 0;
11217             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11218             bool has_upper = FALSE;
11219
11220             if (has_intervening_patws) {
11221                 RExC_parse++;   /* past the '*' */
11222
11223                 /* For strict backwards compatibility, don't change the message
11224                  * now that we also have lowercase operands */
11225                 if (isUPPER(*RExC_parse)) {
11226                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11227                 }
11228                 else {
11229                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11230                 }
11231             }
11232             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11233                 if ( *RExC_parse == ':' ) {
11234                     start_arg = RExC_parse + 1;
11235                     break;
11236                 }
11237                 else if (! UTF) {
11238                     if (isUPPER(*RExC_parse)) {
11239                         has_upper = TRUE;
11240                     }
11241                     RExC_parse++;
11242                 }
11243                 else {
11244                     RExC_parse += UTF8SKIP(RExC_parse);
11245                 }
11246             }
11247             verb_len = RExC_parse - start_verb;
11248             if ( start_arg ) {
11249                 if (RExC_parse >= RExC_end) {
11250                     goto unterminated_verb_pattern;
11251                 }
11252
11253                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11254                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11255                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11256                 }
11257                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11258                   unterminated_verb_pattern:
11259                     if (has_upper) {
11260                         vFAIL("Unterminated verb pattern argument");
11261                     }
11262                     else {
11263                         vFAIL("Unterminated '(*...' argument");
11264                     }
11265                 }
11266             } else {
11267                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11268                     if (has_upper) {
11269                         vFAIL("Unterminated verb pattern");
11270                     }
11271                     else {
11272                         vFAIL("Unterminated '(*...' construct");
11273                     }
11274                 }
11275             }
11276
11277             /* Here, we know that RExC_parse < RExC_end */
11278
11279             switch ( *start_verb ) {
11280             case 'A':  /* (*ACCEPT) */
11281                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11282                     op = ACCEPT;
11283                     internal_argval = RExC_nestroot;
11284                 }
11285                 break;
11286             case 'C':  /* (*COMMIT) */
11287                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11288                     op = COMMIT;
11289                 break;
11290             case 'F':  /* (*FAIL) */
11291                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11292                     op = OPFAIL;
11293                 }
11294                 break;
11295             case ':':  /* (*:NAME) */
11296             case 'M':  /* (*MARK:NAME) */
11297                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11298                     op = MARKPOINT;
11299                     arg_required = 1;
11300                 }
11301                 break;
11302             case 'P':  /* (*PRUNE) */
11303                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11304                     op = PRUNE;
11305                 break;
11306             case 'S':   /* (*SKIP) */
11307                 if ( memEQs(start_verb, verb_len,"SKIP") )
11308                     op = SKIP;
11309                 break;
11310             case 'T':  /* (*THEN) */
11311                 /* [19:06] <TimToady> :: is then */
11312                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11313                     op = CUTGROUP;
11314                     RExC_seen |= REG_CUTGROUP_SEEN;
11315                 }
11316                 break;
11317             case 'a':
11318                 if (   memEQs(start_verb, verb_len, "asr")
11319                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11320                 {
11321                     paren = 'r';        /* Mnemonic: recursed run */
11322                     goto script_run;
11323                 }
11324                 else if (memEQs(start_verb, verb_len, "atomic")) {
11325                     paren = 't';    /* AtOMIC */
11326                     goto alpha_assertions;
11327                 }
11328                 break;
11329             case 'p':
11330                 if (   memEQs(start_verb, verb_len, "plb")
11331                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11332                 {
11333                     paren = 'b';
11334                     goto lookbehind_alpha_assertions;
11335                 }
11336                 else if (   memEQs(start_verb, verb_len, "pla")
11337                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11338                 {
11339                     paren = 'a';
11340                     goto alpha_assertions;
11341                 }
11342                 break;
11343             case 'n':
11344                 if (   memEQs(start_verb, verb_len, "nlb")
11345                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11346                 {
11347                     paren = 'B';
11348                     goto lookbehind_alpha_assertions;
11349                 }
11350                 else if (   memEQs(start_verb, verb_len, "nla")
11351                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11352                 {
11353                     paren = 'A';
11354                     goto alpha_assertions;
11355                 }
11356                 break;
11357             case 's':
11358                 if (   memEQs(start_verb, verb_len, "sr")
11359                     || memEQs(start_verb, verb_len, "script_run"))
11360                 {
11361                     regnode_offset atomic;
11362
11363                     paren = 's';
11364
11365                    script_run:
11366
11367                     /* This indicates Unicode rules. */
11368                     REQUIRE_UNI_RULES(flagp, 0);
11369
11370                     if (! start_arg) {
11371                         goto no_colon;
11372                     }
11373
11374                     RExC_parse = start_arg;
11375
11376                     if (RExC_in_script_run) {
11377
11378                         /*  Nested script runs are treated as no-ops, because
11379                          *  if the nested one fails, the outer one must as
11380                          *  well.  It could fail sooner, and avoid (??{} with
11381                          *  side effects, but that is explicitly documented as
11382                          *  undefined behavior. */
11383
11384                         ret = 0;
11385
11386                         if (paren == 's') {
11387                             paren = ':';
11388                             goto parse_rest;
11389                         }
11390
11391                         /* But, the atomic part of a nested atomic script run
11392                          * isn't a no-op, but can be treated just like a '(?>'
11393                          * */
11394                         paren = '>';
11395                         goto parse_rest;
11396                     }
11397
11398                     if (paren == 's') {
11399                         /* Here, we're starting a new regular script run */
11400                         ret = reg_node(pRExC_state, SROPEN);
11401                         RExC_in_script_run = 1;
11402                         is_open = 1;
11403                         goto parse_rest;
11404                     }
11405
11406                     /* Here, we are starting an atomic script run.  This is
11407                      * handled by recursing to deal with the atomic portion
11408                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11409
11410                     ret = reg_node(pRExC_state, SROPEN);
11411
11412                     RExC_in_script_run = 1;
11413
11414                     atomic = reg(pRExC_state, 'r', &flags, depth);
11415                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11416                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11417                         return 0;
11418                     }
11419
11420                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11421                         REQUIRE_BRANCHJ(flagp, 0);
11422                     }
11423
11424                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11425                                                                 SRCLOSE)))
11426                     {
11427                         REQUIRE_BRANCHJ(flagp, 0);
11428                     }
11429
11430                     RExC_in_script_run = 0;
11431                     return ret;
11432                 }
11433
11434                 break;
11435
11436             lookbehind_alpha_assertions:
11437                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11438                 RExC_in_lookbehind++;
11439                 /*FALLTHROUGH*/
11440
11441             alpha_assertions:
11442
11443                 RExC_seen_zerolen++;
11444
11445                 if (! start_arg) {
11446                     goto no_colon;
11447                 }
11448
11449                 /* An empty negative lookahead assertion simply is failure */
11450                 if (paren == 'A' && RExC_parse == start_arg) {
11451                     ret=reganode(pRExC_state, OPFAIL, 0);
11452                     nextchar(pRExC_state);
11453                     return ret;
11454                 }
11455
11456                 RExC_parse = start_arg;
11457                 goto parse_rest;
11458
11459               no_colon:
11460                 vFAIL2utf8f(
11461                 "'(*%" UTF8f "' requires a terminating ':'",
11462                 UTF8fARG(UTF, verb_len, start_verb));
11463                 NOT_REACHED; /*NOTREACHED*/
11464
11465             } /* End of switch */
11466             if ( ! op ) {
11467                 RExC_parse += UTF
11468                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11469                               : 1;
11470                 if (has_upper || verb_len == 0) {
11471                     vFAIL2utf8f(
11472                     "Unknown verb pattern '%" UTF8f "'",
11473                     UTF8fARG(UTF, verb_len, start_verb));
11474                 }
11475                 else {
11476                     vFAIL2utf8f(
11477                     "Unknown '(*...)' construct '%" UTF8f "'",
11478                     UTF8fARG(UTF, verb_len, start_verb));
11479                 }
11480             }
11481             if ( RExC_parse == start_arg ) {
11482                 start_arg = NULL;
11483             }
11484             if ( arg_required && !start_arg ) {
11485                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11486                     (int) verb_len, start_verb);
11487             }
11488             if (internal_argval == -1) {
11489                 ret = reganode(pRExC_state, op, 0);
11490             } else {
11491                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11492             }
11493             RExC_seen |= REG_VERBARG_SEEN;
11494             if (start_arg) {
11495                 SV *sv = newSVpvn( start_arg,
11496                                     RExC_parse - start_arg);
11497                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11498                                         STR_WITH_LEN("S"));
11499                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11500                 FLAGS(REGNODE_p(ret)) = 1;
11501             } else {
11502                 FLAGS(REGNODE_p(ret)) = 0;
11503             }
11504             if ( internal_argval != -1 )
11505                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11506             nextchar(pRExC_state);
11507             return ret;
11508         }
11509         else if (*RExC_parse == '?') { /* (?...) */
11510             bool is_logical = 0;
11511             const char * const seqstart = RExC_parse;
11512             const char * endptr;
11513             const char non_existent_group_msg[]
11514                                             = "Reference to nonexistent group";
11515             const char impossible_group[] = "Invalid reference to group";
11516
11517             if (has_intervening_patws) {
11518                 RExC_parse++;
11519                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11520             }
11521
11522             RExC_parse++;           /* past the '?' */
11523             paren = *RExC_parse;    /* might be a trailing NUL, if not
11524                                        well-formed */
11525             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11526             if (RExC_parse > RExC_end) {
11527                 paren = '\0';
11528             }
11529             ret = 0;                    /* For look-ahead/behind. */
11530             switch (paren) {
11531
11532             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11533                 paren = *RExC_parse;
11534                 if ( paren == '<') {    /* (?P<...>) named capture */
11535                     RExC_parse++;
11536                     if (RExC_parse >= RExC_end) {
11537                         vFAIL("Sequence (?P<... not terminated");
11538                     }
11539                     goto named_capture;
11540                 }
11541                 else if (paren == '>') {   /* (?P>name) named recursion */
11542                     RExC_parse++;
11543                     if (RExC_parse >= RExC_end) {
11544                         vFAIL("Sequence (?P>... not terminated");
11545                     }
11546                     goto named_recursion;
11547                 }
11548                 else if (paren == '=') {   /* (?P=...)  named backref */
11549                     RExC_parse++;
11550                     return handle_named_backref(pRExC_state, flagp,
11551                                                 parse_start, ')');
11552                 }
11553                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11554                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11555                 vFAIL3("Sequence (%.*s...) not recognized",
11556                                 (int) (RExC_parse - seqstart), seqstart);
11557                 NOT_REACHED; /*NOTREACHED*/
11558             case '<':           /* (?<...) */
11559                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11560                 if (*RExC_parse == '!')
11561                     paren = ',';
11562                 else if (*RExC_parse != '=')
11563               named_capture:
11564                 {               /* (?<...>) */
11565                     char *name_start;
11566                     SV *svname;
11567                     paren= '>';
11568                 /* FALLTHROUGH */
11569             case '\'':          /* (?'...') */
11570                     name_start = RExC_parse;
11571                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11572                     if (   RExC_parse == name_start
11573                         || RExC_parse >= RExC_end
11574                         || *RExC_parse != paren)
11575                     {
11576                         vFAIL2("Sequence (?%c... not terminated",
11577                             paren=='>' ? '<' : (char) paren);
11578                     }
11579                     {
11580                         HE *he_str;
11581                         SV *sv_dat = NULL;
11582                         if (!svname) /* shouldn't happen */
11583                             Perl_croak(aTHX_
11584                                 "panic: reg_scan_name returned NULL");
11585                         if (!RExC_paren_names) {
11586                             RExC_paren_names= newHV();
11587                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11588 #ifdef DEBUGGING
11589                             RExC_paren_name_list= newAV();
11590                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11591 #endif
11592                         }
11593                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11594                         if ( he_str )
11595                             sv_dat = HeVAL(he_str);
11596                         if ( ! sv_dat ) {
11597                             /* croak baby croak */
11598                             Perl_croak(aTHX_
11599                                 "panic: paren_name hash element allocation failed");
11600                         } else if ( SvPOK(sv_dat) ) {
11601                             /* (?|...) can mean we have dupes so scan to check
11602                                its already been stored. Maybe a flag indicating
11603                                we are inside such a construct would be useful,
11604                                but the arrays are likely to be quite small, so
11605                                for now we punt -- dmq */
11606                             IV count = SvIV(sv_dat);
11607                             I32 *pv = (I32*)SvPVX(sv_dat);
11608                             IV i;
11609                             for ( i = 0 ; i < count ; i++ ) {
11610                                 if ( pv[i] == RExC_npar ) {
11611                                     count = 0;
11612                                     break;
11613                                 }
11614                             }
11615                             if ( count ) {
11616                                 pv = (I32*)SvGROW(sv_dat,
11617                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11618                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11619                                 pv[count] = RExC_npar;
11620                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11621                             }
11622                         } else {
11623                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11624                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11625                                                                 sizeof(I32));
11626                             SvIOK_on(sv_dat);
11627                             SvIV_set(sv_dat, 1);
11628                         }
11629 #ifdef DEBUGGING
11630                         /* Yes this does cause a memory leak in debugging Perls
11631                          * */
11632                         if (!av_store(RExC_paren_name_list,
11633                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11634                             SvREFCNT_dec_NN(svname);
11635 #endif
11636
11637                         /*sv_dump(sv_dat);*/
11638                     }
11639                     nextchar(pRExC_state);
11640                     paren = 1;
11641                     goto capturing_parens;
11642                 }
11643
11644                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11645                 RExC_in_lookbehind++;
11646                 RExC_parse++;
11647                 if (RExC_parse >= RExC_end) {
11648                     vFAIL("Sequence (?... not terminated");
11649                 }
11650                 RExC_seen_zerolen++;
11651                 break;
11652             case '=':           /* (?=...) */
11653                 RExC_seen_zerolen++;
11654                 RExC_in_lookahead++;
11655                 break;
11656             case '!':           /* (?!...) */
11657                 RExC_seen_zerolen++;
11658                 /* check if we're really just a "FAIL" assertion */
11659                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11660                                         FALSE /* Don't force to /x */ );
11661                 if (*RExC_parse == ')') {
11662                     ret=reganode(pRExC_state, OPFAIL, 0);
11663                     nextchar(pRExC_state);
11664                     return ret;
11665                 }
11666                 break;
11667             case '|':           /* (?|...) */
11668                 /* branch reset, behave like a (?:...) except that
11669                    buffers in alternations share the same numbers */
11670                 paren = ':';
11671                 after_freeze = freeze_paren = RExC_npar;
11672
11673                 /* XXX This construct currently requires an extra pass.
11674                  * Investigation would be required to see if that could be
11675                  * changed */
11676                 REQUIRE_PARENS_PASS;
11677                 break;
11678             case ':':           /* (?:...) */
11679             case '>':           /* (?>...) */
11680                 break;
11681             case '$':           /* (?$...) */
11682             case '@':           /* (?@...) */
11683                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11684                 break;
11685             case '0' :           /* (?0) */
11686             case 'R' :           /* (?R) */
11687                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11688                     FAIL("Sequence (?R) not terminated");
11689                 num = 0;
11690                 RExC_seen |= REG_RECURSE_SEEN;
11691
11692                 /* XXX These constructs currently require an extra pass.
11693                  * It probably could be changed */
11694                 REQUIRE_PARENS_PASS;
11695
11696                 *flagp |= POSTPONED;
11697                 goto gen_recurse_regop;
11698                 /*notreached*/
11699             /* named and numeric backreferences */
11700             case '&':            /* (?&NAME) */
11701                 parse_start = RExC_parse - 1;
11702               named_recursion:
11703                 {
11704                     SV *sv_dat = reg_scan_name(pRExC_state,
11705                                                REG_RSN_RETURN_DATA);
11706                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11707                 }
11708                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11709                     vFAIL("Sequence (?&... not terminated");
11710                 goto gen_recurse_regop;
11711                 /* NOTREACHED */
11712             case '+':
11713                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11714                     RExC_parse++;
11715                     vFAIL("Illegal pattern");
11716                 }
11717                 goto parse_recursion;
11718                 /* NOTREACHED*/
11719             case '-': /* (?-1) */
11720                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11721                     RExC_parse--; /* rewind to let it be handled later */
11722                     goto parse_flags;
11723                 }
11724                 /* FALLTHROUGH */
11725             case '1': case '2': case '3': case '4': /* (?1) */
11726             case '5': case '6': case '7': case '8': case '9':
11727                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11728               parse_recursion:
11729                 {
11730                     bool is_neg = FALSE;
11731                     UV unum;
11732                     parse_start = RExC_parse - 1; /* MJD */
11733                     if (*RExC_parse == '-') {
11734                         RExC_parse++;
11735                         is_neg = TRUE;
11736                     }
11737                     endptr = RExC_end;
11738                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11739                         && unum <= I32_MAX
11740                     ) {
11741                         num = (I32)unum;
11742                         RExC_parse = (char*)endptr;
11743                     }
11744                     else {  /* Overflow, or something like that.  Position
11745                                beyond all digits for the message */
11746                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
11747                             RExC_parse++;
11748                         }
11749                         vFAIL(impossible_group);
11750                     }
11751                     if (is_neg) {
11752                         /* -num is always representable on 1 and 2's complement
11753                          * machines */
11754                         num = -num;
11755                     }
11756                 }
11757                 if (*RExC_parse!=')')
11758                     vFAIL("Expecting close bracket");
11759
11760               gen_recurse_regop:
11761                 if (paren == '-' || paren == '+') {
11762
11763                     /* Don't overflow */
11764                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11765                         RExC_parse++;
11766                         vFAIL(impossible_group);
11767                     }
11768
11769                     /*
11770                     Diagram of capture buffer numbering.
11771                     Top line is the normal capture buffer numbers
11772                     Bottom line is the negative indexing as from
11773                     the X (the (?-2))
11774
11775                         1 2    3 4 5 X   Y      6 7
11776                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11777                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11778                     -   5 4    3 2 1 X   Y      x x
11779
11780                     Resolve to absolute group.  Recall that RExC_npar is +1 of
11781                     the actual parenthesis group number.  For lookahead, we
11782                     have to compensate for that.  Using the above example, when
11783                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
11784                     want 7 for +2, and 4 for -2.
11785                     */
11786                     if ( paren == '+' ) {
11787                         num--;
11788                     }
11789
11790                     num += RExC_npar;
11791
11792                     if (paren == '-' && num < 1) {
11793                         RExC_parse++;
11794                         vFAIL(non_existent_group_msg);
11795                     }
11796                 }
11797
11798                 if (num >= RExC_npar) {
11799
11800                     /* It might be a forward reference; we can't fail until we
11801                      * know, by completing the parse to get all the groups, and
11802                      * then reparsing */
11803                     if (ALL_PARENS_COUNTED)  {
11804                         if (num >= RExC_total_parens) {
11805                             RExC_parse++;
11806                             vFAIL(non_existent_group_msg);
11807                         }
11808                     }
11809                     else {
11810                         REQUIRE_PARENS_PASS;
11811                     }
11812                 }
11813
11814                 /* We keep track how many GOSUB items we have produced.
11815                    To start off the ARG2L() of the GOSUB holds its "id",
11816                    which is used later in conjunction with RExC_recurse
11817                    to calculate the offset we need to jump for the GOSUB,
11818                    which it will store in the final representation.
11819                    We have to defer the actual calculation until much later
11820                    as the regop may move.
11821                  */
11822                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11823                 RExC_recurse_count++;
11824                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11825                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11826                             22, "|    |", (int)(depth * 2 + 1), "",
11827                             (UV)ARG(REGNODE_p(ret)),
11828                             (IV)ARG2L(REGNODE_p(ret))));
11829                 RExC_seen |= REG_RECURSE_SEEN;
11830
11831                 Set_Node_Length(REGNODE_p(ret),
11832                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11833                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11834
11835                 *flagp |= POSTPONED;
11836                 assert(*RExC_parse == ')');
11837                 nextchar(pRExC_state);
11838                 return ret;
11839
11840             /* NOTREACHED */
11841
11842             case '?':           /* (??...) */
11843                 is_logical = 1;
11844                 if (*RExC_parse != '{') {
11845                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11846                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11847                     vFAIL2utf8f(
11848                         "Sequence (%" UTF8f "...) not recognized",
11849                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11850                     NOT_REACHED; /*NOTREACHED*/
11851                 }
11852                 *flagp |= POSTPONED;
11853                 paren = '{';
11854                 RExC_parse++;
11855                 /* FALLTHROUGH */
11856             case '{':           /* (?{...}) */
11857             {
11858                 U32 n = 0;
11859                 struct reg_code_block *cb;
11860                 OP * o;
11861
11862                 RExC_seen_zerolen++;
11863
11864                 if (   !pRExC_state->code_blocks
11865                     || pRExC_state->code_index
11866                                         >= pRExC_state->code_blocks->count
11867                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11868                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11869                             - RExC_start)
11870                 ) {
11871                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11872                         FAIL("panic: Sequence (?{...}): no code block found\n");
11873                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11874                 }
11875                 /* this is a pre-compiled code block (?{...}) */
11876                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11877                 RExC_parse = RExC_start + cb->end;
11878                 o = cb->block;
11879                 if (cb->src_regex) {
11880                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11881                     RExC_rxi->data->data[n] =
11882                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11883                     RExC_rxi->data->data[n+1] = (void*)o;
11884                 }
11885                 else {
11886                     n = add_data(pRExC_state,
11887                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11888                     RExC_rxi->data->data[n] = (void*)o;
11889                 }
11890                 pRExC_state->code_index++;
11891                 nextchar(pRExC_state);
11892
11893                 if (is_logical) {
11894                     regnode_offset eval;
11895                     ret = reg_node(pRExC_state, LOGICAL);
11896
11897                     eval = reg2Lanode(pRExC_state, EVAL,
11898                                        n,
11899
11900                                        /* for later propagation into (??{})
11901                                         * return value */
11902                                        RExC_flags & RXf_PMf_COMPILETIME
11903                                       );
11904                     FLAGS(REGNODE_p(ret)) = 2;
11905                     if (! REGTAIL(pRExC_state, ret, eval)) {
11906                         REQUIRE_BRANCHJ(flagp, 0);
11907                     }
11908                     /* deal with the length of this later - MJD */
11909                     return ret;
11910                 }
11911                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11912                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11913                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11914                 return ret;
11915             }
11916             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11917             {
11918                 int is_define= 0;
11919                 const int DEFINE_len = sizeof("DEFINE") - 1;
11920                 if (    RExC_parse < RExC_end - 1
11921                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11922                             && (   RExC_parse[1] == '='
11923                                 || RExC_parse[1] == '!'
11924                                 || RExC_parse[1] == '<'
11925                                 || RExC_parse[1] == '{'))
11926                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11927                             && (   memBEGINs(RExC_parse + 1,
11928                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11929                                          "pla:")
11930                                 || memBEGINs(RExC_parse + 1,
11931                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11932                                          "plb:")
11933                                 || memBEGINs(RExC_parse + 1,
11934                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11935                                          "nla:")
11936                                 || memBEGINs(RExC_parse + 1,
11937                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11938                                          "nlb:")
11939                                 || memBEGINs(RExC_parse + 1,
11940                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11941                                          "positive_lookahead:")
11942                                 || memBEGINs(RExC_parse + 1,
11943                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11944                                          "positive_lookbehind:")
11945                                 || memBEGINs(RExC_parse + 1,
11946                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11947                                          "negative_lookahead:")
11948                                 || memBEGINs(RExC_parse + 1,
11949                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11950                                          "negative_lookbehind:"))))
11951                 ) { /* Lookahead or eval. */
11952                     I32 flag;
11953                     regnode_offset tail;
11954
11955                     ret = reg_node(pRExC_state, LOGICAL);
11956                     FLAGS(REGNODE_p(ret)) = 1;
11957
11958                     tail = reg(pRExC_state, 1, &flag, depth+1);
11959                     RETURN_FAIL_ON_RESTART(flag, flagp);
11960                     if (! REGTAIL(pRExC_state, ret, tail)) {
11961                         REQUIRE_BRANCHJ(flagp, 0);
11962                     }
11963                     goto insert_if;
11964                 }
11965                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11966                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11967                 {
11968                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11969                     char *name_start= RExC_parse++;
11970                     U32 num = 0;
11971                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11972                     if (   RExC_parse == name_start
11973                         || RExC_parse >= RExC_end
11974                         || *RExC_parse != ch)
11975                     {
11976                         vFAIL2("Sequence (?(%c... not terminated",
11977                             (ch == '>' ? '<' : ch));
11978                     }
11979                     RExC_parse++;
11980                     if (sv_dat) {
11981                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11982                         RExC_rxi->data->data[num]=(void*)sv_dat;
11983                         SvREFCNT_inc_simple_void_NN(sv_dat);
11984                     }
11985                     ret = reganode(pRExC_state, GROUPPN, num);
11986                     goto insert_if_check_paren;
11987                 }
11988                 else if (memBEGINs(RExC_parse,
11989                                    (STRLEN) (RExC_end - RExC_parse),
11990                                    "DEFINE"))
11991                 {
11992                     ret = reganode(pRExC_state, DEFINEP, 0);
11993                     RExC_parse += DEFINE_len;
11994                     is_define = 1;
11995                     goto insert_if_check_paren;
11996                 }
11997                 else if (RExC_parse[0] == 'R') {
11998                     RExC_parse++;
11999                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
12000                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12001                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12002                      */
12003                     parno = 0;
12004                     if (RExC_parse[0] == '0') {
12005                         parno = 1;
12006                         RExC_parse++;
12007                     }
12008                     else if (inRANGE(RExC_parse[0], '1', '9')) {
12009                         UV uv;
12010                         endptr = RExC_end;
12011                         if (grok_atoUV(RExC_parse, &uv, &endptr)
12012                             && uv <= I32_MAX
12013                         ) {
12014                             parno = (I32)uv + 1;
12015                             RExC_parse = (char*)endptr;
12016                         }
12017                         /* else "Switch condition not recognized" below */
12018                     } else if (RExC_parse[0] == '&') {
12019                         SV *sv_dat;
12020                         RExC_parse++;
12021                         sv_dat = reg_scan_name(pRExC_state,
12022                                                REG_RSN_RETURN_DATA);
12023                         if (sv_dat)
12024                             parno = 1 + *((I32 *)SvPVX(sv_dat));
12025                     }
12026                     ret = reganode(pRExC_state, INSUBP, parno);
12027                     goto insert_if_check_paren;
12028                 }
12029                 else if (inRANGE(RExC_parse[0], '1', '9')) {
12030                     /* (?(1)...) */
12031                     char c;
12032                     UV uv;
12033                     endptr = RExC_end;
12034                     if (grok_atoUV(RExC_parse, &uv, &endptr)
12035                         && uv <= I32_MAX
12036                     ) {
12037                         parno = (I32)uv;
12038                         RExC_parse = (char*)endptr;
12039                     }
12040                     else {
12041                         vFAIL("panic: grok_atoUV returned FALSE");
12042                     }
12043                     ret = reganode(pRExC_state, GROUPP, parno);
12044
12045                  insert_if_check_paren:
12046                     if (UCHARAT(RExC_parse) != ')') {
12047                         RExC_parse += UTF
12048                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12049                                       : 1;
12050                         vFAIL("Switch condition not recognized");
12051                     }
12052                     nextchar(pRExC_state);
12053                   insert_if:
12054                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12055                                                              IFTHEN, 0)))
12056                     {
12057                         REQUIRE_BRANCHJ(flagp, 0);
12058                     }
12059                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12060                     if (br == 0) {
12061                         RETURN_FAIL_ON_RESTART(flags,flagp);
12062                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12063                               (UV) flags);
12064                     } else
12065                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12066                                                              LONGJMP, 0)))
12067                     {
12068                         REQUIRE_BRANCHJ(flagp, 0);
12069                     }
12070                     c = UCHARAT(RExC_parse);
12071                     nextchar(pRExC_state);
12072                     if (flags&HASWIDTH)
12073                         *flagp |= HASWIDTH;
12074                     if (c == '|') {
12075                         if (is_define)
12076                             vFAIL("(?(DEFINE)....) does not allow branches");
12077
12078                         /* Fake one for optimizer.  */
12079                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12080
12081                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12082                             RETURN_FAIL_ON_RESTART(flags, flagp);
12083                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12084                                   (UV) flags);
12085                         }
12086                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12087                             REQUIRE_BRANCHJ(flagp, 0);
12088                         }
12089                         if (flags&HASWIDTH)
12090                             *flagp |= HASWIDTH;
12091                         c = UCHARAT(RExC_parse);
12092                         nextchar(pRExC_state);
12093                     }
12094                     else
12095                         lastbr = 0;
12096                     if (c != ')') {
12097                         if (RExC_parse >= RExC_end)
12098                             vFAIL("Switch (?(condition)... not terminated");
12099                         else
12100                             vFAIL("Switch (?(condition)... contains too many branches");
12101                     }
12102                     ender = reg_node(pRExC_state, TAIL);
12103                     if (! REGTAIL(pRExC_state, br, ender)) {
12104                         REQUIRE_BRANCHJ(flagp, 0);
12105                     }
12106                     if (lastbr) {
12107                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12108                             REQUIRE_BRANCHJ(flagp, 0);
12109                         }
12110                         if (! REGTAIL(pRExC_state,
12111                                       REGNODE_OFFSET(
12112                                                  NEXTOPER(
12113                                                  NEXTOPER(REGNODE_p(lastbr)))),
12114                                       ender))
12115                         {
12116                             REQUIRE_BRANCHJ(flagp, 0);
12117                         }
12118                     }
12119                     else
12120                         if (! REGTAIL(pRExC_state, ret, ender)) {
12121                             REQUIRE_BRANCHJ(flagp, 0);
12122                         }
12123 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12124                     RExC_size++; /* XXX WHY do we need this?!!
12125                                     For large programs it seems to be required
12126                                     but I can't figure out why. -- dmq*/
12127 #endif
12128                     return ret;
12129                 }
12130                 RExC_parse += UTF
12131                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12132                               : 1;
12133                 vFAIL("Unknown switch condition (?(...))");
12134             }
12135             case '[':           /* (?[ ... ]) */
12136                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12137                                          oregcomp_parse);
12138             case 0: /* A NUL */
12139                 RExC_parse--; /* for vFAIL to print correctly */
12140                 vFAIL("Sequence (? incomplete");
12141                 break;
12142
12143             case ')':
12144                 if (RExC_strict) {  /* [perl #132851] */
12145                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12146                 }
12147                 /* FALLTHROUGH */
12148             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12149             /* FALLTHROUGH */
12150             default: /* e.g., (?i) */
12151                 RExC_parse = (char *) seqstart + 1;
12152               parse_flags:
12153                 parse_lparen_question_flags(pRExC_state);
12154                 if (UCHARAT(RExC_parse) != ':') {
12155                     if (RExC_parse < RExC_end)
12156                         nextchar(pRExC_state);
12157                     *flagp = TRYAGAIN;
12158                     return 0;
12159                 }
12160                 paren = ':';
12161                 nextchar(pRExC_state);
12162                 ret = 0;
12163                 goto parse_rest;
12164             } /* end switch */
12165         }
12166         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12167           capturing_parens:
12168             parno = RExC_npar;
12169             RExC_npar++;
12170             if (! ALL_PARENS_COUNTED) {
12171                 /* If we are in our first pass through (and maybe only pass),
12172                  * we  need to allocate memory for the capturing parentheses
12173                  * data structures.
12174                  */
12175
12176                 if (!RExC_parens_buf_size) {
12177                     /* first guess at number of parens we might encounter */
12178                     RExC_parens_buf_size = 10;
12179
12180                     /* setup RExC_open_parens, which holds the address of each
12181                      * OPEN tag, and to make things simpler for the 0 index the
12182                      * start of the program - this is used later for offsets */
12183                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12184                             regnode_offset);
12185                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12186
12187                     /* setup RExC_close_parens, which holds the address of each
12188                      * CLOSE tag, and to make things simpler for the 0 index
12189                      * the end of the program - this is used later for offsets
12190                      * */
12191                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12192                             regnode_offset);
12193                     /* we dont know where end op starts yet, so we dont need to
12194                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12195                      * above */
12196                 }
12197                 else if (RExC_npar > RExC_parens_buf_size) {
12198                     I32 old_size = RExC_parens_buf_size;
12199
12200                     RExC_parens_buf_size *= 2;
12201
12202                     Renew(RExC_open_parens, RExC_parens_buf_size,
12203                             regnode_offset);
12204                     Zero(RExC_open_parens + old_size,
12205                             RExC_parens_buf_size - old_size, regnode_offset);
12206
12207                     Renew(RExC_close_parens, RExC_parens_buf_size,
12208                             regnode_offset);
12209                     Zero(RExC_close_parens + old_size,
12210                             RExC_parens_buf_size - old_size, regnode_offset);
12211                 }
12212             }
12213
12214             ret = reganode(pRExC_state, OPEN, parno);
12215             if (!RExC_nestroot)
12216                 RExC_nestroot = parno;
12217             if (RExC_open_parens && !RExC_open_parens[parno])
12218             {
12219                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12220                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12221                     22, "|    |", (int)(depth * 2 + 1), "",
12222                     (IV)parno, ret));
12223                 RExC_open_parens[parno]= ret;
12224             }
12225
12226             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12227             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12228             is_open = 1;
12229         } else {
12230             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12231             paren = ':';
12232             ret = 0;
12233         }
12234     }
12235     else                        /* ! paren */
12236         ret = 0;
12237
12238    parse_rest:
12239     /* Pick up the branches, linking them together. */
12240     parse_start = RExC_parse;   /* MJD */
12241     br = regbranch(pRExC_state, &flags, 1, depth+1);
12242
12243     /*     branch_len = (paren != 0); */
12244
12245     if (br == 0) {
12246         RETURN_FAIL_ON_RESTART(flags, flagp);
12247         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12248     }
12249     if (*RExC_parse == '|') {
12250         if (RExC_use_BRANCHJ) {
12251             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12252         }
12253         else {                  /* MJD */
12254             reginsert(pRExC_state, BRANCH, br, depth+1);
12255             Set_Node_Length(REGNODE_p(br), paren != 0);
12256             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12257         }
12258         have_branch = 1;
12259     }
12260     else if (paren == ':') {
12261         *flagp |= flags&SIMPLE;
12262     }
12263     if (is_open) {                              /* Starts with OPEN. */
12264         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12265             REQUIRE_BRANCHJ(flagp, 0);
12266         }
12267     }
12268     else if (paren != '?')              /* Not Conditional */
12269         ret = br;
12270     *flagp |= flags & (HASWIDTH | POSTPONED);
12271     lastbr = br;
12272     while (*RExC_parse == '|') {
12273         if (RExC_use_BRANCHJ) {
12274             bool shut_gcc_up;
12275
12276             ender = reganode(pRExC_state, LONGJMP, 0);
12277
12278             /* Append to the previous. */
12279             shut_gcc_up = REGTAIL(pRExC_state,
12280                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12281                          ender);
12282             PERL_UNUSED_VAR(shut_gcc_up);
12283         }
12284         nextchar(pRExC_state);
12285         if (freeze_paren) {
12286             if (RExC_npar > after_freeze)
12287                 after_freeze = RExC_npar;
12288             RExC_npar = freeze_paren;
12289         }
12290         br = regbranch(pRExC_state, &flags, 0, depth+1);
12291
12292         if (br == 0) {
12293             RETURN_FAIL_ON_RESTART(flags, flagp);
12294             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12295         }
12296         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12297             REQUIRE_BRANCHJ(flagp, 0);
12298         }
12299         lastbr = br;
12300         *flagp |= flags & (HASWIDTH | POSTPONED);
12301     }
12302
12303     if (have_branch || paren != ':') {
12304         regnode * br;
12305
12306         /* Make a closing node, and hook it on the end. */
12307         switch (paren) {
12308         case ':':
12309             ender = reg_node(pRExC_state, TAIL);
12310             break;
12311         case 1: case 2:
12312             ender = reganode(pRExC_state, CLOSE, parno);
12313             if ( RExC_close_parens ) {
12314                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12315                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12316                         22, "|    |", (int)(depth * 2 + 1), "",
12317                         (IV)parno, ender));
12318                 RExC_close_parens[parno]= ender;
12319                 if (RExC_nestroot == parno)
12320                     RExC_nestroot = 0;
12321             }
12322             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12323             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12324             break;
12325         case 's':
12326             ender = reg_node(pRExC_state, SRCLOSE);
12327             RExC_in_script_run = 0;
12328             break;
12329         case '<':
12330         case 'a':
12331         case 'A':
12332         case 'b':
12333         case 'B':
12334         case ',':
12335         case '=':
12336         case '!':
12337             *flagp &= ~HASWIDTH;
12338             /* FALLTHROUGH */
12339         case 't':   /* aTomic */
12340         case '>':
12341             ender = reg_node(pRExC_state, SUCCEED);
12342             break;
12343         case 0:
12344             ender = reg_node(pRExC_state, END);
12345             assert(!RExC_end_op); /* there can only be one! */
12346             RExC_end_op = REGNODE_p(ender);
12347             if (RExC_close_parens) {
12348                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12349                     "%*s%*s Setting close paren #0 (END) to %zu\n",
12350                     22, "|    |", (int)(depth * 2 + 1), "",
12351                     ender));
12352
12353                 RExC_close_parens[0]= ender;
12354             }
12355             break;
12356         }
12357         DEBUG_PARSE_r({
12358             DEBUG_PARSE_MSG("lsbr");
12359             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12360             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12361             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12362                           SvPV_nolen_const(RExC_mysv1),
12363                           (IV)lastbr,
12364                           SvPV_nolen_const(RExC_mysv2),
12365                           (IV)ender,
12366                           (IV)(ender - lastbr)
12367             );
12368         });
12369         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12370             REQUIRE_BRANCHJ(flagp, 0);
12371         }
12372
12373         if (have_branch) {
12374             char is_nothing= 1;
12375             if (depth==1)
12376                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12377
12378             /* Hook the tails of the branches to the closing node. */
12379             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12380                 const U8 op = PL_regkind[OP(br)];
12381                 if (op == BRANCH) {
12382                     if (! REGTAIL_STUDY(pRExC_state,
12383                                         REGNODE_OFFSET(NEXTOPER(br)),
12384                                         ender))
12385                     {
12386                         REQUIRE_BRANCHJ(flagp, 0);
12387                     }
12388                     if ( OP(NEXTOPER(br)) != NOTHING
12389                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12390                         is_nothing= 0;
12391                 }
12392                 else if (op == BRANCHJ) {
12393                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12394                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12395                                         ender);
12396                     PERL_UNUSED_VAR(shut_gcc_up);
12397                     /* for now we always disable this optimisation * /
12398                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12399                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12400                     */
12401                         is_nothing= 0;
12402                 }
12403             }
12404             if (is_nothing) {
12405                 regnode * ret_as_regnode = REGNODE_p(ret);
12406                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12407                                ? regnext(ret_as_regnode)
12408                                : ret_as_regnode;
12409                 DEBUG_PARSE_r({
12410                     DEBUG_PARSE_MSG("NADA");
12411                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12412                                      NULL, pRExC_state);
12413                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12414                                      NULL, pRExC_state);
12415                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12416                                   SvPV_nolen_const(RExC_mysv1),
12417                                   (IV)REG_NODE_NUM(ret_as_regnode),
12418                                   SvPV_nolen_const(RExC_mysv2),
12419                                   (IV)ender,
12420                                   (IV)(ender - ret)
12421                     );
12422                 });
12423                 OP(br)= NOTHING;
12424                 if (OP(REGNODE_p(ender)) == TAIL) {
12425                     NEXT_OFF(br)= 0;
12426                     RExC_emit= REGNODE_OFFSET(br) + 1;
12427                 } else {
12428                     regnode *opt;
12429                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12430                         OP(opt)= OPTIMIZED;
12431                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12432                 }
12433             }
12434         }
12435     }
12436
12437     {
12438         const char *p;
12439          /* Even/odd or x=don't care: 010101x10x */
12440         static const char parens[] = "=!aA<,>Bbt";
12441          /* flag below is set to 0 up through 'A'; 1 for larger */
12442
12443         if (paren && (p = strchr(parens, paren))) {
12444             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12445             int flag = (p - parens) > 3;
12446
12447             if (paren == '>' || paren == 't') {
12448                 node = SUSPEND, flag = 0;
12449             }
12450
12451             reginsert(pRExC_state, node, ret, depth+1);
12452             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12453             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12454             FLAGS(REGNODE_p(ret)) = flag;
12455             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12456             {
12457                 REQUIRE_BRANCHJ(flagp, 0);
12458             }
12459         }
12460     }
12461
12462     /* Check for proper termination. */
12463     if (paren) {
12464         /* restore original flags, but keep (?p) and, if we've encountered
12465          * something in the parse that changes /d rules into /u, keep the /u */
12466         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12467         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12468             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12469         }
12470         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12471             RExC_parse = oregcomp_parse;
12472             vFAIL("Unmatched (");
12473         }
12474         nextchar(pRExC_state);
12475     }
12476     else if (!paren && RExC_parse < RExC_end) {
12477         if (*RExC_parse == ')') {
12478             RExC_parse++;
12479             vFAIL("Unmatched )");
12480         }
12481         else
12482             FAIL("Junk on end of regexp");      /* "Can't happen". */
12483         NOT_REACHED; /* NOTREACHED */
12484     }
12485
12486     if (RExC_in_lookbehind) {
12487         RExC_in_lookbehind--;
12488     }
12489     if (RExC_in_lookahead) {
12490         RExC_in_lookahead--;
12491     }
12492     if (after_freeze > RExC_npar)
12493         RExC_npar = after_freeze;
12494     return(ret);
12495 }
12496
12497 /*
12498  - regbranch - one alternative of an | operator
12499  *
12500  * Implements the concatenation operator.
12501  *
12502  * On success, returns the offset at which any next node should be placed into
12503  * the regex engine program being compiled.
12504  *
12505  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12506  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12507  * UTF-8
12508  */
12509 STATIC regnode_offset
12510 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12511 {
12512     regnode_offset ret;
12513     regnode_offset chain = 0;
12514     regnode_offset latest;
12515     I32 flags = 0, c = 0;
12516     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12517
12518     PERL_ARGS_ASSERT_REGBRANCH;
12519
12520     DEBUG_PARSE("brnc");
12521
12522     if (first)
12523         ret = 0;
12524     else {
12525         if (RExC_use_BRANCHJ)
12526             ret = reganode(pRExC_state, BRANCHJ, 0);
12527         else {
12528             ret = reg_node(pRExC_state, BRANCH);
12529             Set_Node_Length(REGNODE_p(ret), 1);
12530         }
12531     }
12532
12533     *flagp = 0;                 /* Initialize. */
12534
12535     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12536                             FALSE /* Don't force to /x */ );
12537     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12538         flags &= ~TRYAGAIN;
12539         latest = regpiece(pRExC_state, &flags, depth+1);
12540         if (latest == 0) {
12541             if (flags & TRYAGAIN)
12542                 continue;
12543             RETURN_FAIL_ON_RESTART(flags, flagp);
12544             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12545         }
12546         else if (ret == 0)
12547             ret = latest;
12548         *flagp |= flags&(HASWIDTH|POSTPONED);
12549         if (chain != 0) {
12550             /* FIXME adding one for every branch after the first is probably
12551              * excessive now we have TRIE support. (hv) */
12552             MARK_NAUGHTY(1);
12553             if (! REGTAIL(pRExC_state, chain, latest)) {
12554                 /* XXX We could just redo this branch, but figuring out what
12555                  * bookkeeping needs to be reset is a pain, and it's likely
12556                  * that other branches that goto END will also be too large */
12557                 REQUIRE_BRANCHJ(flagp, 0);
12558             }
12559         }
12560         chain = latest;
12561         c++;
12562     }
12563     if (chain == 0) {   /* Loop ran zero times. */
12564         chain = reg_node(pRExC_state, NOTHING);
12565         if (ret == 0)
12566             ret = chain;
12567     }
12568     if (c == 1) {
12569         *flagp |= flags&SIMPLE;
12570     }
12571
12572     return ret;
12573 }
12574
12575 /*
12576  - regpiece - something followed by possible quantifier * + ? {n,m}
12577  *
12578  * Note that the branching code sequences used for ? and the general cases
12579  * of * and + are somewhat optimized:  they use the same NOTHING node as
12580  * both the endmarker for their branch list and the body of the last branch.
12581  * It might seem that this node could be dispensed with entirely, but the
12582  * endmarker role is not redundant.
12583  *
12584  * On success, returns the offset at which any next node should be placed into
12585  * the regex engine program being compiled.
12586  *
12587  * Returns 0 otherwise, with *flagp set to indicate why:
12588  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12589  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12590  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12591  */
12592 STATIC regnode_offset
12593 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12594 {
12595     regnode_offset ret;
12596     char op;
12597     char *next;
12598     I32 flags;
12599     const char * const origparse = RExC_parse;
12600     I32 min;
12601     I32 max = REG_INFTY;
12602 #ifdef RE_TRACK_PATTERN_OFFSETS
12603     char *parse_start;
12604 #endif
12605     const char *maxpos = NULL;
12606     UV uv;
12607
12608     /* Save the original in case we change the emitted regop to a FAIL. */
12609     const regnode_offset orig_emit = RExC_emit;
12610
12611     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12612
12613     PERL_ARGS_ASSERT_REGPIECE;
12614
12615     DEBUG_PARSE("piec");
12616
12617     ret = regatom(pRExC_state, &flags, depth+1);
12618     if (ret == 0) {
12619         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12620         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12621     }
12622
12623     op = *RExC_parse;
12624
12625     if (op == '{' && regcurly(RExC_parse)) {
12626         maxpos = NULL;
12627 #ifdef RE_TRACK_PATTERN_OFFSETS
12628         parse_start = RExC_parse; /* MJD */
12629 #endif
12630         next = RExC_parse + 1;
12631         while (isDIGIT(*next) || *next == ',') {
12632             if (*next == ',') {
12633                 if (maxpos)
12634                     break;
12635                 else
12636                     maxpos = next;
12637             }
12638             next++;
12639         }
12640         if (*next == '}') {             /* got one */
12641             const char* endptr;
12642             if (!maxpos)
12643                 maxpos = next;
12644             RExC_parse++;
12645             if (isDIGIT(*RExC_parse)) {
12646                 endptr = RExC_end;
12647                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12648                     vFAIL("Invalid quantifier in {,}");
12649                 if (uv >= REG_INFTY)
12650                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12651                 min = (I32)uv;
12652             } else {
12653                 min = 0;
12654             }
12655             if (*maxpos == ',')
12656                 maxpos++;
12657             else
12658                 maxpos = RExC_parse;
12659             if (isDIGIT(*maxpos)) {
12660                 endptr = RExC_end;
12661                 if (!grok_atoUV(maxpos, &uv, &endptr))
12662                     vFAIL("Invalid quantifier in {,}");
12663                 if (uv >= REG_INFTY)
12664                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12665                 max = (I32)uv;
12666             } else {
12667                 max = REG_INFTY;                /* meaning "infinity" */
12668             }
12669             RExC_parse = next;
12670             nextchar(pRExC_state);
12671             if (max < min) {    /* If can't match, warn and optimize to fail
12672                                    unconditionally */
12673                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12674                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12675                 NEXT_OFF(REGNODE_p(orig_emit)) =
12676                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12677                 return ret;
12678             }
12679             else if (min == max && *RExC_parse == '?')
12680             {
12681                 ckWARN2reg(RExC_parse + 1,
12682                            "Useless use of greediness modifier '%c'",
12683                            *RExC_parse);
12684             }
12685
12686           do_curly:
12687             if ((flags&SIMPLE)) {
12688                 if (min == 0 && max == REG_INFTY) {
12689
12690                     /* Going from 0..inf is currently forbidden in wildcard
12691                      * subpatterns.  The only reason is to make it harder to
12692                      * write patterns that take a long long time to halt, and
12693                      * because the use of this construct isn't necessary in
12694                      * matching Unicode property values */
12695                     if (RExC_pm_flags & PMf_WILDCARD) {
12696                         RExC_parse++;
12697                         /* diag_listed_as: Use of %s is not allowed in Unicode
12698                            property wildcard subpatterns in regex; marked by
12699                            <-- HERE in m/%s/ */
12700                         vFAIL("Use of quantifier '*' is not allowed in"
12701                               " Unicode property wildcard subpatterns");
12702                         /* Note, don't need to worry about {0,}, as a '}' isn't
12703                          * legal at all in wildcards, so wouldn't get this far
12704                          * */
12705                     }
12706                     reginsert(pRExC_state, STAR, ret, depth+1);
12707                     MARK_NAUGHTY(4);
12708                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12709                     goto nest_check;
12710                 }
12711                 if (min == 1 && max == REG_INFTY) {
12712                     reginsert(pRExC_state, PLUS, ret, depth+1);
12713                     MARK_NAUGHTY(3);
12714                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12715                     goto nest_check;
12716                 }
12717                 MARK_NAUGHTY_EXP(2, 2);
12718                 reginsert(pRExC_state, CURLY, ret, depth+1);
12719                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12720                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12721             }
12722             else {
12723                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12724
12725                 FLAGS(REGNODE_p(w)) = 0;
12726                 if (!  REGTAIL(pRExC_state, ret, w)) {
12727                     REQUIRE_BRANCHJ(flagp, 0);
12728                 }
12729                 if (RExC_use_BRANCHJ) {
12730                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12731                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12732                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12733                 }
12734                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12735                                 /* MJD hk */
12736                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12737                 Set_Node_Length(REGNODE_p(ret),
12738                                 op == '{' ? (RExC_parse - parse_start) : 1);
12739
12740                 if (RExC_use_BRANCHJ)
12741                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12742                                                        LONGJMP. */
12743                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12744                                                           NOTHING)))
12745                 {
12746                     REQUIRE_BRANCHJ(flagp, 0);
12747                 }
12748                 RExC_whilem_seen++;
12749                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12750             }
12751             FLAGS(REGNODE_p(ret)) = 0;
12752
12753             if (min > 0)
12754                 *flagp = 0;
12755             if (max > 0)
12756                 *flagp |= HASWIDTH;
12757             ARG1_SET(REGNODE_p(ret), (U16)min);
12758             ARG2_SET(REGNODE_p(ret), (U16)max);
12759             if (max == REG_INFTY)
12760                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12761
12762             goto nest_check;
12763         }
12764     }
12765
12766     if (!ISMULT1(op)) {
12767         *flagp = flags;
12768         return(ret);
12769     }
12770
12771 #if 0                           /* Now runtime fix should be reliable. */
12772
12773     /* if this is reinstated, don't forget to put this back into perldiag:
12774
12775             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12776
12777            (F) The part of the regexp subject to either the * or + quantifier
12778            could match an empty string. The {#} shows in the regular
12779            expression about where the problem was discovered.
12780
12781     */
12782
12783     if (!(flags&HASWIDTH) && op != '?')
12784       vFAIL("Regexp *+ operand could be empty");
12785 #endif
12786
12787 #ifdef RE_TRACK_PATTERN_OFFSETS
12788     parse_start = RExC_parse;
12789 #endif
12790     nextchar(pRExC_state);
12791
12792     *flagp = HASWIDTH;
12793
12794     if (op == '*') {
12795         min = 0;
12796         goto do_curly;
12797     }
12798     else if (op == '+') {
12799         min = 1;
12800         goto do_curly;
12801     }
12802     else if (op == '?') {
12803         min = 0; max = 1;
12804         goto do_curly;
12805     }
12806   nest_check:
12807     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12808         if (origparse[0] == '\\' && origparse[1] == 'K') {
12809             vFAIL2utf8f(
12810                        "%" UTF8f " is forbidden - matches null string many times",
12811                        UTF8fARG(UTF, (RExC_parse >= origparse
12812                                      ? RExC_parse - origparse
12813                                      : 0),
12814                        origparse));
12815             /* NOT-REACHED */
12816         } else {
12817             ckWARN2reg(RExC_parse,
12818                        "%" UTF8f " matches null string many times",
12819                        UTF8fARG(UTF, (RExC_parse >= origparse
12820                                      ? RExC_parse - origparse
12821                                      : 0),
12822                        origparse));
12823         }
12824     }
12825
12826     if (*RExC_parse == '?') {
12827         nextchar(pRExC_state);
12828         reginsert(pRExC_state, MINMOD, ret, depth+1);
12829         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12830             REQUIRE_BRANCHJ(flagp, 0);
12831         }
12832     }
12833     else if (*RExC_parse == '+') {
12834         regnode_offset ender;
12835         nextchar(pRExC_state);
12836         ender = reg_node(pRExC_state, SUCCEED);
12837         if (! REGTAIL(pRExC_state, ret, ender)) {
12838             REQUIRE_BRANCHJ(flagp, 0);
12839         }
12840         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12841         ender = reg_node(pRExC_state, TAIL);
12842         if (! REGTAIL(pRExC_state, ret, ender)) {
12843             REQUIRE_BRANCHJ(flagp, 0);
12844         }
12845     }
12846
12847     if (ISMULT2(RExC_parse)) {
12848         RExC_parse++;
12849         vFAIL("Nested quantifiers");
12850     }
12851
12852     return(ret);
12853 }
12854
12855 STATIC bool
12856 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12857                 regnode_offset * node_p,
12858                 UV * code_point_p,
12859                 int * cp_count,
12860                 I32 * flagp,
12861                 const bool strict,
12862                 const U32 depth
12863     )
12864 {
12865  /* This routine teases apart the various meanings of \N and returns
12866   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12867   * in the current context.
12868   *
12869   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12870   *
12871   * If <code_point_p> is not NULL, the context is expecting the result to be a
12872   * single code point.  If this \N instance turns out to a single code point,
12873   * the function returns TRUE and sets *code_point_p to that code point.
12874   *
12875   * If <node_p> is not NULL, the context is expecting the result to be one of
12876   * the things representable by a regnode.  If this \N instance turns out to be
12877   * one such, the function generates the regnode, returns TRUE and sets *node_p
12878   * to point to the offset of that regnode into the regex engine program being
12879   * compiled.
12880   *
12881   * If this instance of \N isn't legal in any context, this function will
12882   * generate a fatal error and not return.
12883   *
12884   * On input, RExC_parse should point to the first char following the \N at the
12885   * time of the call.  On successful return, RExC_parse will have been updated
12886   * to point to just after the sequence identified by this routine.  Also
12887   * *flagp has been updated as needed.
12888   *
12889   * When there is some problem with the current context and this \N instance,
12890   * the function returns FALSE, without advancing RExC_parse, nor setting
12891   * *node_p, nor *code_point_p, nor *flagp.
12892   *
12893   * If <cp_count> is not NULL, the caller wants to know the length (in code
12894   * points) that this \N sequence matches.  This is set, and the input is
12895   * parsed for errors, even if the function returns FALSE, as detailed below.
12896   *
12897   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12898   *
12899   * Probably the most common case is for the \N to specify a single code point.
12900   * *cp_count will be set to 1, and *code_point_p will be set to that code
12901   * point.
12902   *
12903   * Another possibility is for the input to be an empty \N{}.  This is no
12904   * longer accepted, and will generate a fatal error.
12905   *
12906   * Another possibility is for a custom charnames handler to be in effect which
12907   * translates the input name to an empty string.  *cp_count will be set to 0.
12908   * *node_p will be set to a generated NOTHING node.
12909   *
12910   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12911   * set to 0. *node_p will be set to a generated REG_ANY node.
12912   *
12913   * The fifth possibility is that \N resolves to a sequence of more than one
12914   * code points.  *cp_count will be set to the number of code points in the
12915   * sequence. *node_p will be set to a generated node returned by this
12916   * function calling S_reg().
12917   *
12918   * The final possibility is that it is premature to be calling this function;
12919   * the parse needs to be restarted.  This can happen when this changes from
12920   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12921   * latter occurs only when the fifth possibility would otherwise be in
12922   * effect, and is because one of those code points requires the pattern to be
12923   * recompiled as UTF-8.  The function returns FALSE, and sets the
12924   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12925   * happens, the caller needs to desist from continuing parsing, and return
12926   * this information to its caller.  This is not set for when there is only one
12927   * code point, as this can be called as part of an ANYOF node, and they can
12928   * store above-Latin1 code points without the pattern having to be in UTF-8.
12929   *
12930   * For non-single-quoted regexes, the tokenizer has resolved character and
12931   * sequence names inside \N{...} into their Unicode values, normalizing the
12932   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12933   * hex-represented code points in the sequence.  This is done there because
12934   * the names can vary based on what charnames pragma is in scope at the time,
12935   * so we need a way to take a snapshot of what they resolve to at the time of
12936   * the original parse. [perl #56444].
12937   *
12938   * That parsing is skipped for single-quoted regexes, so here we may get
12939   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12940   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12941   * the native character set for non-ASCII platforms.  The other possibilities
12942   * are already native, so no translation is done. */
12943
12944     char * endbrace;    /* points to '}' following the name */
12945     char* p = RExC_parse; /* Temporary */
12946
12947     SV * substitute_parse = NULL;
12948     char *orig_end;
12949     char *save_start;
12950     I32 flags;
12951
12952     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12953
12954     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12955
12956     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12957     assert(! (node_p && cp_count));               /* At most 1 should be set */
12958
12959     if (cp_count) {     /* Initialize return for the most common case */
12960         *cp_count = 1;
12961     }
12962
12963     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12964      * modifier.  The other meanings do not, so use a temporary until we find
12965      * out which we are being called with */
12966     skip_to_be_ignored_text(pRExC_state, &p,
12967                             FALSE /* Don't force to /x */ );
12968
12969     /* Disambiguate between \N meaning a named character versus \N meaning
12970      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12971      * quantifier, or if there is no '{' at all */
12972     if (*p != '{' || regcurly(p)) {
12973         RExC_parse = p;
12974         if (cp_count) {
12975             *cp_count = -1;
12976         }
12977
12978         if (! node_p) {
12979             return FALSE;
12980         }
12981
12982         *node_p = reg_node(pRExC_state, REG_ANY);
12983         *flagp |= HASWIDTH|SIMPLE;
12984         MARK_NAUGHTY(1);
12985         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12986         return TRUE;
12987     }
12988
12989     /* The test above made sure that the next real character is a '{', but
12990      * under the /x modifier, it could be separated by space (or a comment and
12991      * \n) and this is not allowed (for consistency with \x{...} and the
12992      * tokenizer handling of \N{NAME}). */
12993     if (*RExC_parse != '{') {
12994         vFAIL("Missing braces on \\N{}");
12995     }
12996
12997     RExC_parse++;       /* Skip past the '{' */
12998
12999     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13000     if (! endbrace) { /* no trailing brace */
13001         vFAIL2("Missing right brace on \\%c{}", 'N');
13002     }
13003
13004     /* Here, we have decided it should be a named character or sequence.  These
13005      * imply Unicode semantics */
13006     REQUIRE_UNI_RULES(flagp, FALSE);
13007
13008     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13009      * nothing at all (not allowed under strict) */
13010     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13011         RExC_parse = endbrace;
13012         if (strict) {
13013             RExC_parse++;   /* Position after the "}" */
13014             vFAIL("Zero length \\N{}");
13015         }
13016
13017         if (cp_count) {
13018             *cp_count = 0;
13019         }
13020         nextchar(pRExC_state);
13021         if (! node_p) {
13022             return FALSE;
13023         }
13024
13025         *node_p = reg_node(pRExC_state, NOTHING);
13026         return TRUE;
13027     }
13028
13029     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13030
13031         /* Here, the name isn't of the form  U+....  This can happen if the
13032          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
13033          * is the time to find out what the name means */
13034
13035         const STRLEN name_len = endbrace - RExC_parse;
13036         SV *  value_sv;     /* What does this name evaluate to */
13037         SV ** value_svp;
13038         const U8 * value;   /* string of name's value */
13039         STRLEN value_len;   /* and its length */
13040
13041         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
13042          *  toke.c, and their values. Make sure is initialized */
13043         if (! RExC_unlexed_names) {
13044             RExC_unlexed_names = newHV();
13045         }
13046
13047         /* If we have already seen this name in this pattern, use that.  This
13048          * allows us to only call the charnames handler once per name per
13049          * pattern.  A broken or malicious handler could return something
13050          * different each time, which could cause the results to vary depending
13051          * on if something gets added or subtracted from the pattern that
13052          * causes the number of passes to change, for example */
13053         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13054                                                       name_len, 0)))
13055         {
13056             value_sv = *value_svp;
13057         }
13058         else { /* Otherwise we have to go out and get the name */
13059             const char * error_msg = NULL;
13060             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13061                                                       UTF,
13062                                                       &error_msg);
13063             if (error_msg) {
13064                 RExC_parse = endbrace;
13065                 vFAIL(error_msg);
13066             }
13067
13068             /* If no error message, should have gotten a valid return */
13069             assert (value_sv);
13070
13071             /* Save the name's meaning for later use */
13072             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13073                            value_sv, 0))
13074             {
13075                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13076             }
13077         }
13078
13079         /* Here, we have the value the name evaluates to in 'value_sv' */
13080         value = (U8 *) SvPV(value_sv, value_len);
13081
13082         /* See if the result is one code point vs 0 or multiple */
13083         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13084                                   ? UTF8SKIP(value)
13085                                   : 1)))
13086         {
13087             /* Here, exactly one code point.  If that isn't what is wanted,
13088              * fail */
13089             if (! code_point_p) {
13090                 RExC_parse = p;
13091                 return FALSE;
13092             }
13093
13094             /* Convert from string to numeric code point */
13095             *code_point_p = (SvUTF8(value_sv))
13096                             ? valid_utf8_to_uvchr(value, NULL)
13097                             : *value;
13098
13099             /* Have parsed this entire single code point \N{...}.  *cp_count
13100              * has already been set to 1, so don't do it again. */
13101             RExC_parse = endbrace;
13102             nextchar(pRExC_state);
13103             return TRUE;
13104         } /* End of is a single code point */
13105
13106         /* Count the code points, if caller desires.  The API says to do this
13107          * even if we will later return FALSE */
13108         if (cp_count) {
13109             *cp_count = 0;
13110
13111             *cp_count = (SvUTF8(value_sv))
13112                         ? utf8_length(value, value + value_len)
13113                         : value_len;
13114         }
13115
13116         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13117          * But don't back the pointer up if the caller wants to know how many
13118          * code points there are (they need to handle it themselves in this
13119          * case).  */
13120         if (! node_p) {
13121             if (! cp_count) {
13122                 RExC_parse = p;
13123             }
13124             return FALSE;
13125         }
13126
13127         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13128          * reg recursively to parse it.  That way, it retains its atomicness,
13129          * while not having to worry about any special handling that some code
13130          * points may have. */
13131
13132         substitute_parse = newSVpvs("?:");
13133         sv_catsv(substitute_parse, value_sv);
13134         sv_catpv(substitute_parse, ")");
13135
13136         /* The value should already be native, so no need to convert on EBCDIC
13137          * platforms.*/
13138         assert(! RExC_recode_x_to_native);
13139
13140     }
13141     else {   /* \N{U+...} */
13142         Size_t count = 0;   /* code point count kept internally */
13143
13144         /* We can get to here when the input is \N{U+...} or when toke.c has
13145          * converted a name to the \N{U+...} form.  This include changing a
13146          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13147
13148         RExC_parse += 2;    /* Skip past the 'U+' */
13149
13150         /* Code points are separated by dots.  The '}' terminates the whole
13151          * thing. */
13152
13153         do {    /* Loop until the ending brace */
13154             I32 flags = PERL_SCAN_SILENT_OVERFLOW
13155                       | PERL_SCAN_SILENT_ILLDIGIT
13156                       | PERL_SCAN_NOTIFY_ILLDIGIT
13157                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13158                       | PERL_SCAN_DISALLOW_PREFIX;
13159             STRLEN len = endbrace - RExC_parse;
13160             NV overflow_value;
13161             char * start_digit = RExC_parse;
13162             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13163
13164             if (len == 0) {
13165                 RExC_parse++;
13166               bad_NU:
13167                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13168             }
13169
13170             RExC_parse += len;
13171
13172             if (cp > MAX_LEGAL_CP) {
13173                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13174             }
13175
13176             if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13177                 if (count) {
13178                     goto do_concat;
13179                 }
13180
13181                 /* Here, is a single code point; fail if doesn't want that */
13182                 if (! code_point_p) {
13183                     RExC_parse = p;
13184                     return FALSE;
13185                 }
13186
13187                 /* A single code point is easy to handle; just return it */
13188                 *code_point_p = UNI_TO_NATIVE(cp);
13189                 RExC_parse = endbrace;
13190                 nextchar(pRExC_state);
13191                 return TRUE;
13192             }
13193
13194             /* Here, the parse stopped bfore the ending brace.  This is legal
13195              * only if that character is a dot separating code points, like a
13196              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13197              * So the next character must be a dot (and the one after that
13198              * can't be the endbrace, or we'd have something like \N{U+100.} )
13199              * */
13200             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13201                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13202                               ? UTF8SKIP(RExC_parse)
13203                               : 1;
13204                 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13205                                                           malformed utf8 */
13206                 goto bad_NU;
13207             }
13208
13209             /* Here, looks like its really a multiple character sequence.  Fail
13210              * if that's not what the caller wants.  But continue with counting
13211              * and error checking if they still want a count */
13212             if (! node_p && ! cp_count) {
13213                 return FALSE;
13214             }
13215
13216             /* What is done here is to convert this to a sub-pattern of the
13217              * form \x{char1}\x{char2}...  and then call reg recursively to
13218              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13219              * atomicness, while not having to worry about special handling
13220              * that some code points may have.  We don't create a subpattern,
13221              * but go through the motions of code point counting and error
13222              * checking, if the caller doesn't want a node returned. */
13223
13224             if (node_p && ! substitute_parse) {
13225                 substitute_parse = newSVpvs("?:");
13226             }
13227
13228           do_concat:
13229
13230             if (node_p) {
13231                 /* Convert to notation the rest of the code understands */
13232                 sv_catpvs(substitute_parse, "\\x{");
13233                 sv_catpvn(substitute_parse, start_digit,
13234                                             RExC_parse - start_digit);
13235                 sv_catpvs(substitute_parse, "}");
13236             }
13237
13238             /* Move to after the dot (or ending brace the final time through.)
13239              * */
13240             RExC_parse++;
13241             count++;
13242
13243         } while (RExC_parse < endbrace);
13244
13245         if (! node_p) { /* Doesn't want the node */
13246             assert (cp_count);
13247
13248             *cp_count = count;
13249             return FALSE;
13250         }
13251
13252         sv_catpvs(substitute_parse, ")");
13253
13254         /* The values are Unicode, and therefore have to be converted to native
13255          * on a non-Unicode (meaning non-ASCII) platform. */
13256         SET_recode_x_to_native(1);
13257     }
13258
13259     /* Here, we have the string the name evaluates to, ready to be parsed,
13260      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13261      * constructs.  This can be called from within a substitute parse already.
13262      * The error reporting mechanism doesn't work for 2 levels of this, but the
13263      * code above has validated this new construct, so there should be no
13264      * errors generated by the below.  And this isn' an exact copy, so the
13265      * mechanism to seamlessly deal with this won't work, so turn off warnings
13266      * during it */
13267     save_start = RExC_start;
13268     orig_end = RExC_end;
13269
13270     RExC_parse = RExC_start = SvPVX(substitute_parse);
13271     RExC_end = RExC_parse + SvCUR(substitute_parse);
13272     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13273
13274     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13275
13276     /* Restore the saved values */
13277     RESTORE_WARNINGS;
13278     RExC_start = save_start;
13279     RExC_parse = endbrace;
13280     RExC_end = orig_end;
13281     SET_recode_x_to_native(0);
13282
13283     SvREFCNT_dec_NN(substitute_parse);
13284
13285     if (! *node_p) {
13286         RETURN_FAIL_ON_RESTART(flags, flagp);
13287         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13288             (UV) flags);
13289     }
13290     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13291
13292     nextchar(pRExC_state);
13293
13294     return TRUE;
13295 }
13296
13297
13298 STATIC U8
13299 S_compute_EXACTish(RExC_state_t *pRExC_state)
13300 {
13301     U8 op;
13302
13303     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13304
13305     if (! FOLD) {
13306         return (LOC)
13307                 ? EXACTL
13308                 : EXACT;
13309     }
13310
13311     op = get_regex_charset(RExC_flags);
13312     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13313         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13314                  been, so there is no hole */
13315     }
13316
13317     return op + EXACTF;
13318 }
13319
13320 STATIC bool
13321 S_new_regcurly(const char *s, const char *e)
13322 {
13323     /* This is a temporary function designed to match the most lenient form of
13324      * a {m,n} quantifier we ever envision, with either number omitted, and
13325      * spaces anywhere between/before/after them.
13326      *
13327      * If this function fails, then the string it matches is very unlikely to
13328      * ever be considered a valid quantifier, so we can allow the '{' that
13329      * begins it to be considered as a literal */
13330
13331     bool has_min = FALSE;
13332     bool has_max = FALSE;
13333
13334     PERL_ARGS_ASSERT_NEW_REGCURLY;
13335
13336     if (s >= e || *s++ != '{')
13337         return FALSE;
13338
13339     while (s < e && isSPACE(*s)) {
13340         s++;
13341     }
13342     while (s < e && isDIGIT(*s)) {
13343         has_min = TRUE;
13344         s++;
13345     }
13346     while (s < e && isSPACE(*s)) {
13347         s++;
13348     }
13349
13350     if (*s == ',') {
13351         s++;
13352         while (s < e && isSPACE(*s)) {
13353             s++;
13354         }
13355         while (s < e && isDIGIT(*s)) {
13356             has_max = TRUE;
13357             s++;
13358         }
13359         while (s < e && isSPACE(*s)) {
13360             s++;
13361         }
13362     }
13363
13364     return s < e && *s == '}' && (has_min || has_max);
13365 }
13366
13367 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13368  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13369
13370 static I32
13371 S_backref_value(char *p, char *e)
13372 {
13373     const char* endptr = e;
13374     UV val;
13375     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13376         return (I32)val;
13377     return I32_MAX;
13378 }
13379
13380
13381 /*
13382  - regatom - the lowest level
13383
13384    Try to identify anything special at the start of the current parse position.
13385    If there is, then handle it as required. This may involve generating a
13386    single regop, such as for an assertion; or it may involve recursing, such as
13387    to handle a () structure.
13388
13389    If the string doesn't start with something special then we gobble up
13390    as much literal text as we can.  If we encounter a quantifier, we have to
13391    back off the final literal character, as that quantifier applies to just it
13392    and not to the whole string of literals.
13393
13394    Once we have been able to handle whatever type of thing started the
13395    sequence, we return the offset into the regex engine program being compiled
13396    at which any  next regnode should be placed.
13397
13398    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13399    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13400    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13401    Otherwise does not return 0.
13402
13403    Note: we have to be careful with escapes, as they can be both literal
13404    and special, and in the case of \10 and friends, context determines which.
13405
13406    A summary of the code structure is:
13407
13408    switch (first_byte) {
13409         cases for each special:
13410             handle this special;
13411             break;
13412         case '\\':
13413             switch (2nd byte) {
13414                 cases for each unambiguous special:
13415                     handle this special;
13416                     break;
13417                 cases for each ambigous special/literal:
13418                     disambiguate;
13419                     if (special)  handle here
13420                     else goto defchar;
13421                 default: // unambiguously literal:
13422                     goto defchar;
13423             }
13424         default:  // is a literal char
13425             // FALL THROUGH
13426         defchar:
13427             create EXACTish node for literal;
13428             while (more input and node isn't full) {
13429                 switch (input_byte) {
13430                    cases for each special;
13431                        make sure parse pointer is set so that the next call to
13432                            regatom will see this special first
13433                        goto loopdone; // EXACTish node terminated by prev. char
13434                    default:
13435                        append char to EXACTISH node;
13436                 }
13437                 get next input byte;
13438             }
13439         loopdone:
13440    }
13441    return the generated node;
13442
13443    Specifically there are two separate switches for handling
13444    escape sequences, with the one for handling literal escapes requiring
13445    a dummy entry for all of the special escapes that are actually handled
13446    by the other.
13447
13448 */
13449
13450 STATIC regnode_offset
13451 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13452 {
13453     regnode_offset ret = 0;
13454     I32 flags = 0;
13455     char *parse_start;
13456     U8 op;
13457     int invert = 0;
13458
13459     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13460
13461     *flagp = 0;         /* Initialize. */
13462
13463     DEBUG_PARSE("atom");
13464
13465     PERL_ARGS_ASSERT_REGATOM;
13466
13467   tryagain:
13468     parse_start = RExC_parse;
13469     assert(RExC_parse < RExC_end);
13470     switch ((U8)*RExC_parse) {
13471     case '^':
13472         RExC_seen_zerolen++;
13473         nextchar(pRExC_state);
13474         if (RExC_flags & RXf_PMf_MULTILINE)
13475             ret = reg_node(pRExC_state, MBOL);
13476         else
13477             ret = reg_node(pRExC_state, SBOL);
13478         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13479         break;
13480     case '$':
13481         nextchar(pRExC_state);
13482         if (*RExC_parse)
13483             RExC_seen_zerolen++;
13484         if (RExC_flags & RXf_PMf_MULTILINE)
13485             ret = reg_node(pRExC_state, MEOL);
13486         else
13487             ret = reg_node(pRExC_state, SEOL);
13488         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13489         break;
13490     case '.':
13491         nextchar(pRExC_state);
13492         if (RExC_flags & RXf_PMf_SINGLELINE)
13493             ret = reg_node(pRExC_state, SANY);
13494         else
13495             ret = reg_node(pRExC_state, REG_ANY);
13496         *flagp |= HASWIDTH|SIMPLE;
13497         MARK_NAUGHTY(1);
13498         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13499         break;
13500     case '[':
13501     {
13502         char * const oregcomp_parse = ++RExC_parse;
13503         ret = regclass(pRExC_state, flagp, depth+1,
13504                        FALSE, /* means parse the whole char class */
13505                        TRUE, /* allow multi-char folds */
13506                        FALSE, /* don't silence non-portable warnings. */
13507                        (bool) RExC_strict,
13508                        TRUE, /* Allow an optimized regnode result */
13509                        NULL);
13510         if (ret == 0) {
13511             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13512             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13513                   (UV) *flagp);
13514         }
13515         if (*RExC_parse != ']') {
13516             RExC_parse = oregcomp_parse;
13517             vFAIL("Unmatched [");
13518         }
13519         nextchar(pRExC_state);
13520         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13521         break;
13522     }
13523     case '(':
13524         nextchar(pRExC_state);
13525         ret = reg(pRExC_state, 2, &flags, depth+1);
13526         if (ret == 0) {
13527                 if (flags & TRYAGAIN) {
13528                     if (RExC_parse >= RExC_end) {
13529                          /* Make parent create an empty node if needed. */
13530                         *flagp |= TRYAGAIN;
13531                         return(0);
13532                     }
13533                     goto tryagain;
13534                 }
13535                 RETURN_FAIL_ON_RESTART(flags, flagp);
13536                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13537                                                                  (UV) flags);
13538         }
13539         *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13540         break;
13541     case '|':
13542     case ')':
13543         if (flags & TRYAGAIN) {
13544             *flagp |= TRYAGAIN;
13545             return 0;
13546         }
13547         vFAIL("Internal urp");
13548                                 /* Supposed to be caught earlier. */
13549         break;
13550     case '?':
13551     case '+':
13552     case '*':
13553         RExC_parse++;
13554         vFAIL("Quantifier follows nothing");
13555         break;
13556     case '\\':
13557         /* Special Escapes
13558
13559            This switch handles escape sequences that resolve to some kind
13560            of special regop and not to literal text. Escape sequences that
13561            resolve to literal text are handled below in the switch marked
13562            "Literal Escapes".
13563
13564            Every entry in this switch *must* have a corresponding entry
13565            in the literal escape switch. However, the opposite is not
13566            required, as the default for this switch is to jump to the
13567            literal text handling code.
13568         */
13569         RExC_parse++;
13570         switch ((U8)*RExC_parse) {
13571         /* Special Escapes */
13572         case 'A':
13573             RExC_seen_zerolen++;
13574             /* Under wildcards, this is changed to match \n; should be
13575              * invisible to the user, as they have to compile under /m */
13576             if (RExC_pm_flags & PMf_WILDCARD) {
13577                 ret = reg_node(pRExC_state, MBOL);
13578             }
13579             else {
13580                 ret = reg_node(pRExC_state, SBOL);
13581                 /* SBOL is shared with /^/ so we set the flags so we can tell
13582                  * /\A/ from /^/ in split. */
13583                 FLAGS(REGNODE_p(ret)) = 1;
13584                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13585             }
13586             goto finish_meta_pat;
13587         case 'G':
13588             if (RExC_pm_flags & PMf_WILDCARD) {
13589                 RExC_parse++;
13590                 /* diag_listed_as: Use of %s is not allowed in Unicode property
13591                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13592                  */
13593                 vFAIL("Use of '\\G' is not allowed in Unicode property"
13594                       " wildcard subpatterns");
13595             }
13596             ret = reg_node(pRExC_state, GPOS);
13597             RExC_seen |= REG_GPOS_SEEN;
13598             *flagp |= SIMPLE;
13599             goto finish_meta_pat;
13600         case 'K':
13601             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13602                 RExC_seen_zerolen++;
13603                 ret = reg_node(pRExC_state, KEEPS);
13604                 *flagp |= SIMPLE;
13605                 /* XXX:dmq : disabling in-place substitution seems to
13606                  * be necessary here to avoid cases of memory corruption, as
13607                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13608                  */
13609                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13610                 goto finish_meta_pat;
13611             }
13612             else {
13613                 ++RExC_parse; /* advance past the 'K' */
13614                 vFAIL("\\K not permitted in lookahead/lookbehind");
13615             }
13616         case 'Z':
13617             if (RExC_pm_flags & PMf_WILDCARD) {
13618                 /* See comment under \A above */
13619                 ret = reg_node(pRExC_state, MEOL);
13620             }
13621             else {
13622                 ret = reg_node(pRExC_state, SEOL);
13623                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13624             }
13625             RExC_seen_zerolen++;                /* Do not optimize RE away */
13626             goto finish_meta_pat;
13627         case 'z':
13628             if (RExC_pm_flags & PMf_WILDCARD) {
13629                 /* See comment under \A above */
13630                 ret = reg_node(pRExC_state, MEOL);
13631             }
13632             else {
13633                 ret = reg_node(pRExC_state, EOS);
13634                 *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
13635             }
13636             RExC_seen_zerolen++;                /* Do not optimize RE away */
13637             goto finish_meta_pat;
13638         case 'C':
13639             vFAIL("\\C no longer supported");
13640         case 'X':
13641             ret = reg_node(pRExC_state, CLUMP);
13642             *flagp |= HASWIDTH;
13643             goto finish_meta_pat;
13644
13645         case 'B':
13646             invert = 1;
13647             /* FALLTHROUGH */
13648         case 'b':
13649           {
13650             U8 flags = 0;
13651             regex_charset charset = get_regex_charset(RExC_flags);
13652
13653             RExC_seen_zerolen++;
13654             RExC_seen |= REG_LOOKBEHIND_SEEN;
13655             op = BOUND + charset;
13656
13657             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13658                 flags = TRADITIONAL_BOUND;
13659                 if (op > BOUNDA) {  /* /aa is same as /a */
13660                     op = BOUNDA;
13661                 }
13662             }
13663             else {
13664                 STRLEN length;
13665                 char name = *RExC_parse;
13666                 char * endbrace = NULL;
13667                 RExC_parse += 2;
13668                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13669
13670                 if (! endbrace) {
13671                     vFAIL2("Missing right brace on \\%c{}", name);
13672                 }
13673                 /* XXX Need to decide whether to take spaces or not.  Should be
13674                  * consistent with \p{}, but that currently is SPACE, which
13675                  * means vertical too, which seems wrong
13676                  * while (isBLANK(*RExC_parse)) {
13677                     RExC_parse++;
13678                 }*/
13679                 if (endbrace == RExC_parse) {
13680                     RExC_parse++;  /* After the '}' */
13681                     vFAIL2("Empty \\%c{}", name);
13682                 }
13683                 length = endbrace - RExC_parse;
13684                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13685                     length--;
13686                 }*/
13687                 switch (*RExC_parse) {
13688                     case 'g':
13689                         if (    length != 1
13690                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13691                         {
13692                             goto bad_bound_type;
13693                         }
13694                         flags = GCB_BOUND;
13695                         break;
13696                     case 'l':
13697                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13698                             goto bad_bound_type;
13699                         }
13700                         flags = LB_BOUND;
13701                         break;
13702                     case 's':
13703                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13704                             goto bad_bound_type;
13705                         }
13706                         flags = SB_BOUND;
13707                         break;
13708                     case 'w':
13709                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13710                             goto bad_bound_type;
13711                         }
13712                         flags = WB_BOUND;
13713                         break;
13714                     default:
13715                       bad_bound_type:
13716                         RExC_parse = endbrace;
13717                         vFAIL2utf8f(
13718                             "'%" UTF8f "' is an unknown bound type",
13719                             UTF8fARG(UTF, length, endbrace - length));
13720                         NOT_REACHED; /*NOTREACHED*/
13721                 }
13722                 RExC_parse = endbrace;
13723                 REQUIRE_UNI_RULES(flagp, 0);
13724
13725                 if (op == BOUND) {
13726                     op = BOUNDU;
13727                 }
13728                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13729                     op = BOUNDU;
13730                     length += 4;
13731
13732                     /* Don't have to worry about UTF-8, in this message because
13733                      * to get here the contents of the \b must be ASCII */
13734                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13735                               "Using /u for '%.*s' instead of /%s",
13736                               (unsigned) length,
13737                               endbrace - length + 1,
13738                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13739                               ? ASCII_RESTRICT_PAT_MODS
13740                               : ASCII_MORE_RESTRICT_PAT_MODS);
13741                 }
13742             }
13743
13744             if (op == BOUND) {
13745                 RExC_seen_d_op = TRUE;
13746             }
13747             else if (op == BOUNDL) {
13748                 RExC_contains_locale = 1;
13749             }
13750
13751             if (invert) {
13752                 op += NBOUND - BOUND;
13753             }
13754
13755             ret = reg_node(pRExC_state, op);
13756             FLAGS(REGNODE_p(ret)) = flags;
13757
13758             *flagp |= SIMPLE;
13759
13760             goto finish_meta_pat;
13761           }
13762
13763         case 'R':
13764             ret = reg_node(pRExC_state, LNBREAK);
13765             *flagp |= HASWIDTH|SIMPLE;
13766             goto finish_meta_pat;
13767
13768         case 'd':
13769         case 'D':
13770         case 'h':
13771         case 'H':
13772         case 'p':
13773         case 'P':
13774         case 's':
13775         case 'S':
13776         case 'v':
13777         case 'V':
13778         case 'w':
13779         case 'W':
13780             /* These all have the same meaning inside [brackets], and it knows
13781              * how to do the best optimizations for them.  So, pretend we found
13782              * these within brackets, and let it do the work */
13783             RExC_parse--;
13784
13785             ret = regclass(pRExC_state, flagp, depth+1,
13786                            TRUE, /* means just parse this element */
13787                            FALSE, /* don't allow multi-char folds */
13788                            FALSE, /* don't silence non-portable warnings.  It
13789                                      would be a bug if these returned
13790                                      non-portables */
13791                            (bool) RExC_strict,
13792                            TRUE, /* Allow an optimized regnode result */
13793                            NULL);
13794             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13795             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13796              * multi-char folds are allowed.  */
13797             if (!ret)
13798                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13799                       (UV) *flagp);
13800
13801             RExC_parse--;   /* regclass() leaves this one too far ahead */
13802
13803           finish_meta_pat:
13804                    /* The escapes above that don't take a parameter can't be
13805                     * followed by a '{'.  But 'pX', 'p{foo}' and
13806                     * correspondingly 'P' can be */
13807             if (   RExC_parse - parse_start == 1
13808                 && UCHARAT(RExC_parse + 1) == '{'
13809                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13810             {
13811                 RExC_parse += 2;
13812                 vFAIL("Unescaped left brace in regex is illegal here");
13813             }
13814             Set_Node_Offset(REGNODE_p(ret), parse_start);
13815             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13816             nextchar(pRExC_state);
13817             break;
13818         case 'N':
13819             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13820              * \N{...} evaluates to a sequence of more than one code points).
13821              * The function call below returns a regnode, which is our result.
13822              * The parameters cause it to fail if the \N{} evaluates to a
13823              * single code point; we handle those like any other literal.  The
13824              * reason that the multicharacter case is handled here and not as
13825              * part of the EXACtish code is because of quantifiers.  In
13826              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13827              * this way makes that Just Happen. dmq.
13828              * join_exact() will join this up with adjacent EXACTish nodes
13829              * later on, if appropriate. */
13830             ++RExC_parse;
13831             if (grok_bslash_N(pRExC_state,
13832                               &ret,     /* Want a regnode returned */
13833                               NULL,     /* Fail if evaluates to a single code
13834                                            point */
13835                               NULL,     /* Don't need a count of how many code
13836                                            points */
13837                               flagp,
13838                               RExC_strict,
13839                               depth)
13840             ) {
13841                 break;
13842             }
13843
13844             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13845
13846             /* Here, evaluates to a single code point.  Go get that */
13847             RExC_parse = parse_start;
13848             goto defchar;
13849
13850         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13851       parse_named_seq:
13852         {
13853             char ch;
13854             if (   RExC_parse >= RExC_end - 1
13855                 || ((   ch = RExC_parse[1]) != '<'
13856                                       && ch != '\''
13857                                       && ch != '{'))
13858             {
13859                 RExC_parse++;
13860                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13861                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13862             } else {
13863                 RExC_parse += 2;
13864                 ret = handle_named_backref(pRExC_state,
13865                                            flagp,
13866                                            parse_start,
13867                                            (ch == '<')
13868                                            ? '>'
13869                                            : (ch == '{')
13870                                              ? '}'
13871                                              : '\'');
13872             }
13873             break;
13874         }
13875         case 'g':
13876         case '1': case '2': case '3': case '4':
13877         case '5': case '6': case '7': case '8': case '9':
13878             {
13879                 I32 num;
13880                 bool hasbrace = 0;
13881
13882                 if (*RExC_parse == 'g') {
13883                     bool isrel = 0;
13884
13885                     RExC_parse++;
13886                     if (*RExC_parse == '{') {
13887                         RExC_parse++;
13888                         hasbrace = 1;
13889                     }
13890                     if (*RExC_parse == '-') {
13891                         RExC_parse++;
13892                         isrel = 1;
13893                     }
13894                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13895                         if (isrel) RExC_parse--;
13896                         RExC_parse -= 2;
13897                         goto parse_named_seq;
13898                     }
13899
13900                     if (RExC_parse >= RExC_end) {
13901                         goto unterminated_g;
13902                     }
13903                     num = S_backref_value(RExC_parse, RExC_end);
13904                     if (num == 0)
13905                         vFAIL("Reference to invalid group 0");
13906                     else if (num == I32_MAX) {
13907                          if (isDIGIT(*RExC_parse))
13908                             vFAIL("Reference to nonexistent group");
13909                         else
13910                           unterminated_g:
13911                             vFAIL("Unterminated \\g... pattern");
13912                     }
13913
13914                     if (isrel) {
13915                         num = RExC_npar - num;
13916                         if (num < 1)
13917                             vFAIL("Reference to nonexistent or unclosed group");
13918                     }
13919                 }
13920                 else {
13921                     num = S_backref_value(RExC_parse, RExC_end);
13922                     /* bare \NNN might be backref or octal - if it is larger
13923                      * than or equal RExC_npar then it is assumed to be an
13924                      * octal escape. Note RExC_npar is +1 from the actual
13925                      * number of parens. */
13926                     /* Note we do NOT check if num == I32_MAX here, as that is
13927                      * handled by the RExC_npar check */
13928
13929                     if (
13930                         /* any numeric escape < 10 is always a backref */
13931                         num > 9
13932                         /* any numeric escape < RExC_npar is a backref */
13933                         && num >= RExC_npar
13934                         /* cannot be an octal escape if it starts with [89] */
13935                         && ! inRANGE(*RExC_parse, '8', '9')
13936                     ) {
13937                         /* Probably not meant to be a backref, instead likely
13938                          * to be an octal character escape, e.g. \35 or \777.
13939                          * The above logic should make it obvious why using
13940                          * octal escapes in patterns is problematic. - Yves */
13941                         RExC_parse = parse_start;
13942                         goto defchar;
13943                     }
13944                 }
13945
13946                 /* At this point RExC_parse points at a numeric escape like
13947                  * \12 or \88 or something similar, which we should NOT treat
13948                  * as an octal escape. It may or may not be a valid backref
13949                  * escape. For instance \88888888 is unlikely to be a valid
13950                  * backref. */
13951                 while (isDIGIT(*RExC_parse))
13952                     RExC_parse++;
13953                 if (hasbrace) {
13954                     if (*RExC_parse != '}')
13955                         vFAIL("Unterminated \\g{...} pattern");
13956                     RExC_parse++;
13957                 }
13958                 if (num >= (I32)RExC_npar) {
13959
13960                     /* It might be a forward reference; we can't fail until we
13961                      * know, by completing the parse to get all the groups, and
13962                      * then reparsing */
13963                     if (ALL_PARENS_COUNTED)  {
13964                         if (num >= RExC_total_parens)  {
13965                             vFAIL("Reference to nonexistent group");
13966                         }
13967                     }
13968                     else {
13969                         REQUIRE_PARENS_PASS;
13970                     }
13971                 }
13972                 RExC_sawback = 1;
13973                 ret = reganode(pRExC_state,
13974                                ((! FOLD)
13975                                  ? REF
13976                                  : (ASCII_FOLD_RESTRICTED)
13977                                    ? REFFA
13978                                    : (AT_LEAST_UNI_SEMANTICS)
13979                                      ? REFFU
13980                                      : (LOC)
13981                                        ? REFFL
13982                                        : REFF),
13983                                 num);
13984                 if (OP(REGNODE_p(ret)) == REFF) {
13985                     RExC_seen_d_op = TRUE;
13986                 }
13987                 *flagp |= HASWIDTH;
13988
13989                 /* override incorrect value set in reganode MJD */
13990                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13991                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13992                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13993                                         FALSE /* Don't force to /x */ );
13994             }
13995             break;
13996         case '\0':
13997             if (RExC_parse >= RExC_end)
13998                 FAIL("Trailing \\");
13999             /* FALLTHROUGH */
14000         default:
14001             /* Do not generate "unrecognized" warnings here, we fall
14002                back into the quick-grab loop below */
14003             RExC_parse = parse_start;
14004             goto defchar;
14005         } /* end of switch on a \foo sequence */
14006         break;
14007
14008     case '#':
14009
14010         /* '#' comments should have been spaced over before this function was
14011          * called */
14012         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14013         /*
14014         if (RExC_flags & RXf_PMf_EXTENDED) {
14015             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14016             if (RExC_parse < RExC_end)
14017                 goto tryagain;
14018         }
14019         */
14020
14021         /* FALLTHROUGH */
14022
14023     default:
14024           defchar: {
14025
14026             /* Here, we have determined that the next thing is probably a
14027              * literal character.  RExC_parse points to the first byte of its
14028              * definition.  (It still may be an escape sequence that evaluates
14029              * to a single character) */
14030
14031             STRLEN len = 0;
14032             UV ender = 0;
14033             char *p;
14034             char *s, *old_s = NULL, *old_old_s = NULL;
14035             char *s0;
14036             U32 max_string_len = 255;
14037
14038             /* We may have to reparse the node, artificially stopping filling
14039              * it early, based on info gleaned in the first parse.  This
14040              * variable gives where we stop.  Make it above the normal stopping
14041              * place first time through; otherwise it would stop too early */
14042             U32 upper_fill = max_string_len + 1;
14043
14044             /* We start out as an EXACT node, even if under /i, until we find a
14045              * character which is in a fold.  The algorithm now segregates into
14046              * separate nodes, characters that fold from those that don't under
14047              * /i.  (This hopefully will create nodes that are fixed strings
14048              * even under /i, giving the optimizer something to grab on to.)
14049              * So, if a node has something in it and the next character is in
14050              * the opposite category, that node is closed up, and the function
14051              * returns.  Then regatom is called again, and a new node is
14052              * created for the new category. */
14053             U8 node_type = EXACT;
14054
14055             /* Assume the node will be fully used; the excess is given back at
14056              * the end.  Under /i, we may need to temporarily add the fold of
14057              * an extra character or two at the end to check for splitting
14058              * multi-char folds, so allocate extra space for that.   We can't
14059              * make any other length assumptions, as a byte input sequence
14060              * could shrink down. */
14061             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14062                                                  + ((! FOLD)
14063                                                     ? 0
14064                                                     : 2 * ((UTF)
14065                                                            ? UTF8_MAXBYTES_CASE
14066                         /* Max non-UTF-8 expansion is 2 */ : 2)));
14067
14068             bool next_is_quantifier;
14069             char * oldp = NULL;
14070
14071             /* We can convert EXACTF nodes to EXACTFU if they contain only
14072              * characters that match identically regardless of the target
14073              * string's UTF8ness.  The reason to do this is that EXACTF is not
14074              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14075              * runtime.
14076              *
14077              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14078              * contain only above-Latin1 characters (hence must be in UTF8),
14079              * which don't participate in folds with Latin1-range characters,
14080              * as the latter's folds aren't known until runtime. */
14081             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14082
14083             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14084              * allows us to override this as encountered */
14085             U8 maybe_SIMPLE = SIMPLE;
14086
14087             /* Does this node contain something that can't match unless the
14088              * target string is (also) in UTF-8 */
14089             bool requires_utf8_target = FALSE;
14090
14091             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14092             bool has_ss = FALSE;
14093
14094             /* So is the MICRO SIGN */
14095             bool has_micro_sign = FALSE;
14096
14097             /* Set when we fill up the current node and there is still more
14098              * text to process */
14099             bool overflowed;
14100
14101             /* Allocate an EXACT node.  The node_type may change below to
14102              * another EXACTish node, but since the size of the node doesn't
14103              * change, it works */
14104             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14105                                                                     "exact");
14106             FILL_NODE(ret, node_type);
14107             RExC_emit++;
14108
14109             s = STRING(REGNODE_p(ret));
14110
14111             s0 = s;
14112
14113           reparse:
14114
14115             p = RExC_parse;
14116             len = 0;
14117             s = s0;
14118             node_type = EXACT;
14119             oldp = NULL;
14120             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14121             maybe_SIMPLE = SIMPLE;
14122             requires_utf8_target = FALSE;
14123             has_ss = FALSE;
14124             has_micro_sign = FALSE;
14125
14126           continue_parse:
14127
14128             /* This breaks under rare circumstances.  If folding, we do not
14129              * want to split a node at a character that is a non-final in a
14130              * multi-char fold, as an input string could just happen to want to
14131              * match across the node boundary.  The code at the end of the loop
14132              * looks for this, and backs off until it finds not such a
14133              * character, but it is possible (though extremely, extremely
14134              * unlikely) for all characters in the node to be non-final fold
14135              * ones, in which case we just leave the node fully filled, and
14136              * hope that it doesn't match the string in just the wrong place */
14137
14138             assert( ! UTF     /* Is at the beginning of a character */
14139                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14140                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14141
14142             overflowed = FALSE;
14143
14144             /* Here, we have a literal character.  Find the maximal string of
14145              * them in the input that we can fit into a single EXACTish node.
14146              * We quit at the first non-literal or when the node gets full, or
14147              * under /i the categorization of folding/non-folding character
14148              * changes */
14149             while (p < RExC_end && len < upper_fill) {
14150
14151                 /* In most cases each iteration adds one byte to the output.
14152                  * The exceptions override this */
14153                 Size_t added_len = 1;
14154
14155                 oldp = p;
14156                 old_old_s = old_s;
14157                 old_s = s;
14158
14159                 /* White space has already been ignored */
14160                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14161                        || ! is_PATWS_safe((p), RExC_end, UTF));
14162
14163                 switch ((U8)*p) {
14164                   const char* message;
14165                   U32 packed_warn;
14166                   U8 grok_c_char;
14167
14168                 case '^':
14169                 case '$':
14170                 case '.':
14171                 case '[':
14172                 case '(':
14173                 case ')':
14174                 case '|':
14175                     goto loopdone;
14176                 case '\\':
14177                     /* Literal Escapes Switch
14178
14179                        This switch is meant to handle escape sequences that
14180                        resolve to a literal character.
14181
14182                        Every escape sequence that represents something
14183                        else, like an assertion or a char class, is handled
14184                        in the switch marked 'Special Escapes' above in this
14185                        routine, but also has an entry here as anything that
14186                        isn't explicitly mentioned here will be treated as
14187                        an unescaped equivalent literal.
14188                     */
14189
14190                     switch ((U8)*++p) {
14191
14192                     /* These are all the special escapes. */
14193                     case 'A':             /* Start assertion */
14194                     case 'b': case 'B':   /* Word-boundary assertion*/
14195                     case 'C':             /* Single char !DANGEROUS! */
14196                     case 'd': case 'D':   /* digit class */
14197                     case 'g': case 'G':   /* generic-backref, pos assertion */
14198                     case 'h': case 'H':   /* HORIZWS */
14199                     case 'k': case 'K':   /* named backref, keep marker */
14200                     case 'p': case 'P':   /* Unicode property */
14201                               case 'R':   /* LNBREAK */
14202                     case 's': case 'S':   /* space class */
14203                     case 'v': case 'V':   /* VERTWS */
14204                     case 'w': case 'W':   /* word class */
14205                     case 'X':             /* eXtended Unicode "combining
14206                                              character sequence" */
14207                     case 'z': case 'Z':   /* End of line/string assertion */
14208                         --p;
14209                         goto loopdone;
14210
14211                     /* Anything after here is an escape that resolves to a
14212                        literal. (Except digits, which may or may not)
14213                      */
14214                     case 'n':
14215                         ender = '\n';
14216                         p++;
14217                         break;
14218                     case 'N': /* Handle a single-code point named character. */
14219                         RExC_parse = p + 1;
14220                         if (! grok_bslash_N(pRExC_state,
14221                                             NULL,   /* Fail if evaluates to
14222                                                        anything other than a
14223                                                        single code point */
14224                                             &ender, /* The returned single code
14225                                                        point */
14226                                             NULL,   /* Don't need a count of
14227                                                        how many code points */
14228                                             flagp,
14229                                             RExC_strict,
14230                                             depth)
14231                         ) {
14232                             if (*flagp & NEED_UTF8)
14233                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14234                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14235
14236                             /* Here, it wasn't a single code point.  Go close
14237                              * up this EXACTish node.  The switch() prior to
14238                              * this switch handles the other cases */
14239                             RExC_parse = p = oldp;
14240                             goto loopdone;
14241                         }
14242                         p = RExC_parse;
14243                         RExC_parse = parse_start;
14244
14245                         /* The \N{} means the pattern, if previously /d,
14246                          * becomes /u.  That means it can't be an EXACTF node,
14247                          * but an EXACTFU */
14248                         if (node_type == EXACTF) {
14249                             node_type = EXACTFU;
14250
14251                             /* If the node already contains something that
14252                              * differs between EXACTF and EXACTFU, reparse it
14253                              * as EXACTFU */
14254                             if (! maybe_exactfu) {
14255                                 len = 0;
14256                                 s = s0;
14257                                 goto reparse;
14258                             }
14259                         }
14260
14261                         break;
14262                     case 'r':
14263                         ender = '\r';
14264                         p++;
14265                         break;
14266                     case 't':
14267                         ender = '\t';
14268                         p++;
14269                         break;
14270                     case 'f':
14271                         ender = '\f';
14272                         p++;
14273                         break;
14274                     case 'e':
14275                         ender = ESC_NATIVE;
14276                         p++;
14277                         break;
14278                     case 'a':
14279                         ender = '\a';
14280                         p++;
14281                         break;
14282                     case 'o':
14283                         if (! grok_bslash_o(&p,
14284                                             RExC_end,
14285                                             &ender,
14286                                             &message,
14287                                             &packed_warn,
14288                                             (bool) RExC_strict,
14289                                             FALSE, /* No illegal cp's */
14290                                             UTF))
14291                         {
14292                             RExC_parse = p; /* going to die anyway; point to
14293                                                exact spot of failure */
14294                             vFAIL(message);
14295                         }
14296
14297                         if (message && TO_OUTPUT_WARNINGS(p)) {
14298                             warn_non_literal_string(p, packed_warn, message);
14299                         }
14300                         break;
14301                     case 'x':
14302                         if (! grok_bslash_x(&p,
14303                                             RExC_end,
14304                                             &ender,
14305                                             &message,
14306                                             &packed_warn,
14307                                             (bool) RExC_strict,
14308                                             FALSE, /* No illegal cp's */
14309                                             UTF))
14310                         {
14311                             RExC_parse = p;     /* going to die anyway; point
14312                                                    to exact spot of failure */
14313                             vFAIL(message);
14314                         }
14315
14316                         if (message && TO_OUTPUT_WARNINGS(p)) {
14317                             warn_non_literal_string(p, packed_warn, message);
14318                         }
14319
14320 #ifdef EBCDIC
14321                         if (ender < 0x100) {
14322                             if (RExC_recode_x_to_native) {
14323                                 ender = LATIN1_TO_NATIVE(ender);
14324                             }
14325                         }
14326 #endif
14327                         break;
14328                     case 'c':
14329                         p++;
14330                         if (! grok_bslash_c(*p, &grok_c_char,
14331                                             &message, &packed_warn))
14332                         {
14333                             /* going to die anyway; point to exact spot of
14334                              * failure */
14335                             RExC_parse = p + ((UTF)
14336                                               ? UTF8_SAFE_SKIP(p, RExC_end)
14337                                               : 1);
14338                             vFAIL(message);
14339                         }
14340
14341                         ender = grok_c_char;
14342                         p++;
14343                         if (message && TO_OUTPUT_WARNINGS(p)) {
14344                             warn_non_literal_string(p, packed_warn, message);
14345                         }
14346
14347                         break;
14348                     case '8': case '9': /* must be a backreference */
14349                         --p;
14350                         /* we have an escape like \8 which cannot be an octal escape
14351                          * so we exit the loop, and let the outer loop handle this
14352                          * escape which may or may not be a legitimate backref. */
14353                         goto loopdone;
14354                     case '1': case '2': case '3':case '4':
14355                     case '5': case '6': case '7':
14356                         /* When we parse backslash escapes there is ambiguity
14357                          * between backreferences and octal escapes. Any escape
14358                          * from \1 - \9 is a backreference, any multi-digit
14359                          * escape which does not start with 0 and which when
14360                          * evaluated as decimal could refer to an already
14361                          * parsed capture buffer is a back reference. Anything
14362                          * else is octal.
14363                          *
14364                          * Note this implies that \118 could be interpreted as
14365                          * 118 OR as "\11" . "8" depending on whether there
14366                          * were 118 capture buffers defined already in the
14367                          * pattern.  */
14368
14369                         /* NOTE, RExC_npar is 1 more than the actual number of
14370                          * parens we have seen so far, hence the "<" as opposed
14371                          * to "<=" */
14372                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14373                         {  /* Not to be treated as an octal constant, go
14374                                    find backref */
14375                             --p;
14376                             goto loopdone;
14377                         }
14378                         /* FALLTHROUGH */
14379                     case '0':
14380                         {
14381                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14382                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
14383                             STRLEN numlen = 3;
14384                             ender = grok_oct(p, &numlen, &flags, NULL);
14385                             p += numlen;
14386                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14387                                 && isDIGIT(*p)  /* like \08, \178 */
14388                                 && ckWARN(WARN_REGEXP))
14389                             {
14390                                 reg_warn_non_literal_string(
14391                                      p + 1,
14392                                      form_alien_digit_msg(8, numlen, p,
14393                                                         RExC_end, UTF, FALSE));
14394                             }
14395                         }
14396                         break;
14397                     case '\0':
14398                         if (p >= RExC_end)
14399                             FAIL("Trailing \\");
14400                         /* FALLTHROUGH */
14401                     default:
14402                         if (isALPHANUMERIC(*p)) {
14403                             /* An alpha followed by '{' is going to fail next
14404                              * iteration, so don't output this warning in that
14405                              * case */
14406                             if (! isALPHA(*p) || *(p + 1) != '{') {
14407                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14408                                                   " passed through", p);
14409                             }
14410                         }
14411                         goto normal_default;
14412                     } /* End of switch on '\' */
14413                     break;
14414                 case '{':
14415                     /* Trying to gain new uses for '{' without breaking too
14416                      * much existing code is hard.  The solution currently
14417                      * adopted is:
14418                      *  1)  If there is no ambiguity that a '{' should always
14419                      *      be taken literally, at the start of a construct, we
14420                      *      just do so.
14421                      *  2)  If the literal '{' conflicts with our desired use
14422                      *      of it as a metacharacter, we die.  The deprecation
14423                      *      cycles for this have come and gone.
14424                      *  3)  If there is ambiguity, we raise a simple warning.
14425                      *      This could happen, for example, if the user
14426                      *      intended it to introduce a quantifier, but slightly
14427                      *      misspelled the quantifier.  Without this warning,
14428                      *      the quantifier would silently be taken as a literal
14429                      *      string of characters instead of a meta construct */
14430                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14431                         if (      RExC_strict
14432                             || (  p > parse_start + 1
14433                                 && isALPHA_A(*(p - 1))
14434                                 && *(p - 2) == '\\')
14435                             || new_regcurly(p, RExC_end))
14436                         {
14437                             RExC_parse = p + 1;
14438                             vFAIL("Unescaped left brace in regex is "
14439                                   "illegal here");
14440                         }
14441                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14442                                          " passed through");
14443                     }
14444                     goto normal_default;
14445                 case '}':
14446                 case ']':
14447                     if (p > RExC_parse && RExC_strict) {
14448                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14449                     }
14450                     /*FALLTHROUGH*/
14451                 default:    /* A literal character */
14452                   normal_default:
14453                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14454                         STRLEN numlen;
14455                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14456                                                &numlen, UTF8_ALLOW_DEFAULT);
14457                         p += numlen;
14458                     }
14459                     else
14460                         ender = (U8) *p++;
14461                     break;
14462                 } /* End of switch on the literal */
14463
14464                 /* Here, have looked at the literal character, and <ender>
14465                  * contains its ordinal; <p> points to the character after it.
14466                  * */
14467
14468                 if (ender > 255) {
14469                     REQUIRE_UTF8(flagp);
14470                     if (   UNICODE_IS_PERL_EXTENDED(ender)
14471                         && TO_OUTPUT_WARNINGS(p))
14472                     {
14473                         ckWARN2_non_literal_string(p,
14474                                                    packWARN(WARN_PORTABLE),
14475                                                    PL_extended_cp_format,
14476                                                    ender);
14477                     }
14478                 }
14479
14480                 /* We need to check if the next non-ignored thing is a
14481                  * quantifier.  Move <p> to after anything that should be
14482                  * ignored, which, as a side effect, positions <p> for the next
14483                  * loop iteration */
14484                 skip_to_be_ignored_text(pRExC_state, &p,
14485                                         FALSE /* Don't force to /x */ );
14486
14487                 /* If the next thing is a quantifier, it applies to this
14488                  * character only, which means that this character has to be in
14489                  * its own node and can't just be appended to the string in an
14490                  * existing node, so if there are already other characters in
14491                  * the node, close the node with just them, and set up to do
14492                  * this character again next time through, when it will be the
14493                  * only thing in its new node */
14494
14495                 next_is_quantifier =    LIKELY(p < RExC_end)
14496                                      && UNLIKELY(ISMULT2(p));
14497
14498                 if (next_is_quantifier && LIKELY(len)) {
14499                     p = oldp;
14500                     goto loopdone;
14501                 }
14502
14503                 /* Ready to add 'ender' to the node */
14504
14505                 if (! FOLD) {  /* The simple case, just append the literal */
14506                   not_fold_common:
14507
14508                     /* Don't output if it would overflow */
14509                     if (UNLIKELY(len > max_string_len - ((UTF)
14510                                                       ? UVCHR_SKIP(ender)
14511                                                       : 1)))
14512                     {
14513                         overflowed = TRUE;
14514                         break;
14515                     }
14516
14517                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14518                         *(s++) = (char) ender;
14519                     }
14520                     else {
14521                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14522                         added_len = (char *) new_s - s;
14523                         s = (char *) new_s;
14524
14525                         if (ender > 255)  {
14526                             requires_utf8_target = TRUE;
14527                         }
14528                     }
14529                 }
14530                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14531
14532                     /* Here are folding under /l, and the code point is
14533                      * problematic.  If this is the first character in the
14534                      * node, change the node type to folding.   Otherwise, if
14535                      * this is the first problematic character, close up the
14536                      * existing node, so can start a new node with this one */
14537                     if (! len) {
14538                         node_type = EXACTFL;
14539                         RExC_contains_locale = 1;
14540                     }
14541                     else if (node_type == EXACT) {
14542                         p = oldp;
14543                         goto loopdone;
14544                     }
14545
14546                     /* This problematic code point means we can't simplify
14547                      * things */
14548                     maybe_exactfu = FALSE;
14549
14550                     /* Here, we are adding a problematic fold character.
14551                      * "Problematic" in this context means that its fold isn't
14552                      * known until runtime.  (The non-problematic code points
14553                      * are the above-Latin1 ones that fold to also all
14554                      * above-Latin1.  Their folds don't vary no matter what the
14555                      * locale is.) But here we have characters whose fold
14556                      * depends on the locale.  We just add in the unfolded
14557                      * character, and wait until runtime to fold it */
14558                     goto not_fold_common;
14559                 }
14560                 else /* regular fold; see if actually is in a fold */
14561                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14562                          || (ender > 255
14563                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14564                 {
14565                     /* Here, folding, but the character isn't in a fold.
14566                      *
14567                      * Start a new node if previous characters in the node were
14568                      * folded */
14569                     if (len && node_type != EXACT) {
14570                         p = oldp;
14571                         goto loopdone;
14572                     }
14573
14574                     /* Here, continuing a node with non-folded characters.  Add
14575                      * this one */
14576                     goto not_fold_common;
14577                 }
14578                 else {  /* Here, does participate in some fold */
14579
14580                     /* If this is the first character in the node, change its
14581                      * type to folding.  Otherwise, if this is the first
14582                      * folding character in the node, close up the existing
14583                      * node, so can start a new node with this one.  */
14584                     if (! len) {
14585                         node_type = compute_EXACTish(pRExC_state);
14586                     }
14587                     else if (node_type == EXACT) {
14588                         p = oldp;
14589                         goto loopdone;
14590                     }
14591
14592                     if (UTF) {  /* Alway use the folded value for UTF-8
14593                                    patterns */
14594                         if (UVCHR_IS_INVARIANT(ender)) {
14595                             if (UNLIKELY(len + 1 > max_string_len)) {
14596                                 overflowed = TRUE;
14597                                 break;
14598                             }
14599
14600                             *(s)++ = (U8) toFOLD(ender);
14601                         }
14602                         else {
14603                             UV folded = _to_uni_fold_flags(
14604                                     ender,
14605                                     (U8 *) s,  /* We have allocated extra space
14606                                                   in 's' so can't run off the
14607                                                   end */
14608                                     &added_len,
14609                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14610                                                     ? FOLD_FLAGS_NOMIX_ASCII
14611                                                     : 0));
14612                             if (UNLIKELY(len + added_len > max_string_len)) {
14613                                 overflowed = TRUE;
14614                                 break;
14615                             }
14616
14617                             s += added_len;
14618
14619                             if (   folded > 255
14620                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14621                             {
14622                                 /* U+B5 folds to the MU, so its possible for a
14623                                  * non-UTF-8 target to match it */
14624                                 requires_utf8_target = TRUE;
14625                             }
14626                         }
14627                     }
14628                     else { /* Here is non-UTF8. */
14629
14630                         /* The fold will be one or (rarely) two characters.
14631                          * Check that there's room for at least a single one
14632                          * before setting any flags, etc.  Because otherwise an
14633                          * overflowing character could cause a flag to be set
14634                          * even though it doesn't end up in this node.  (For
14635                          * the two character fold, we check again, before
14636                          * setting any flags) */
14637                         if (UNLIKELY(len + 1 > max_string_len)) {
14638                             overflowed = TRUE;
14639                             break;
14640                         }
14641
14642 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14643    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14644                                       || UNICODE_DOT_DOT_VERSION > 0)
14645
14646                         /* On non-ancient Unicodes, check for the only possible
14647                          * multi-char fold  */
14648                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14649
14650                             /* This potential multi-char fold means the node
14651                              * can't be simple (because it could match more
14652                              * than a single char).  And in some cases it will
14653                              * match 'ss', so set that flag */
14654                             maybe_SIMPLE = 0;
14655                             has_ss = TRUE;
14656
14657                             /* It can't change to be an EXACTFU (unless already
14658                              * is one).  We fold it iff under /u rules. */
14659                             if (node_type != EXACTFU) {
14660                                 maybe_exactfu = FALSE;
14661                             }
14662                             else {
14663                                 if (UNLIKELY(len + 2 > max_string_len)) {
14664                                     overflowed = TRUE;
14665                                     break;
14666                                 }
14667
14668                                 *(s++) = 's';
14669                                 *(s++) = 's';
14670                                 added_len = 2;
14671
14672                                 goto done_with_this_char;
14673                             }
14674                         }
14675                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14676                                  && LIKELY(len > 0)
14677                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14678                         {
14679                             /* Also, the sequence 'ss' is special when not
14680                              * under /u.  If the target string is UTF-8, it
14681                              * should match SHARP S; otherwise it won't.  So,
14682                              * here we have to exclude the possibility of this
14683                              * node moving to /u.*/
14684                             has_ss = TRUE;
14685                             maybe_exactfu = FALSE;
14686                         }
14687 #endif
14688                         /* Here, the fold will be a single character */
14689
14690                         if (UNLIKELY(ender == MICRO_SIGN)) {
14691                             has_micro_sign = TRUE;
14692                         }
14693                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14694
14695                             /* If the character's fold differs between /d and
14696                              * /u, this can't change to be an EXACTFU node */
14697                             maybe_exactfu = FALSE;
14698                         }
14699
14700                         *(s++) = (DEPENDS_SEMANTICS)
14701                                  ? (char) toFOLD(ender)
14702
14703                                    /* Under /u, the fold of any character in
14704                                     * the 0-255 range happens to be its
14705                                     * lowercase equivalent, except for LATIN
14706                                     * SMALL LETTER SHARP S, which was handled
14707                                     * above, and the MICRO SIGN, whose fold
14708                                     * requires UTF-8 to represent.  */
14709                                  : (char) toLOWER_L1(ender);
14710                     }
14711                 } /* End of adding current character to the node */
14712
14713               done_with_this_char:
14714
14715                 len += added_len;
14716
14717                 if (next_is_quantifier) {
14718
14719                     /* Here, the next input is a quantifier, and to get here,
14720                      * the current character is the only one in the node. */
14721                     goto loopdone;
14722                 }
14723
14724             } /* End of loop through literal characters */
14725
14726             /* Here we have either exhausted the input or run out of room in
14727              * the node.  If the former, we are done.  (If we encountered a
14728              * character that can't be in the node, transfer is made directly
14729              * to <loopdone>, and so we wouldn't have fallen off the end of the
14730              * loop.)  */
14731             if (LIKELY(! overflowed)) {
14732                 goto loopdone;
14733             }
14734
14735             /* Here we have run out of room.  We can grow plain EXACT and
14736              * LEXACT nodes.  If the pattern is gigantic enough, though,
14737              * eventually we'll have to artificially chunk the pattern into
14738              * multiple nodes. */
14739             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14740                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14741                 Size_t overhead_expansion = 0;
14742                 char temp[256];
14743                 Size_t max_nodes_for_string;
14744                 Size_t achievable;
14745                 SSize_t delta;
14746
14747                 /* Here we couldn't fit the final character in the current
14748                  * node, so it will have to be reparsed, no matter what else we
14749                  * do */
14750                 p = oldp;
14751
14752                 /* If would have overflowed a regular EXACT node, switch
14753                  * instead to an LEXACT.  The code below is structured so that
14754                  * the actual growing code is common to changing from an EXACT
14755                  * or just increasing the LEXACT size.  This means that we have
14756                  * to save the string in the EXACT case before growing, and
14757                  * then copy it afterwards to its new location */
14758                 if (node_type == EXACT) {
14759                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14760                     RExC_emit += overhead_expansion;
14761                     Copy(s0, temp, len, char);
14762                 }
14763
14764                 /* Ready to grow.  If it was a plain EXACT, the string was
14765                  * saved, and the first few bytes of it overwritten by adding
14766                  * an argument field.  We assume, as we do elsewhere in this
14767                  * file, that one byte of remaining input will translate into
14768                  * one byte of output, and if that's too small, we grow again,
14769                  * if too large the excess memory is freed at the end */
14770
14771                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14772                 achievable = MIN(max_nodes_for_string,
14773                                  current_string_nodes + STR_SZ(RExC_end - p));
14774                 delta = achievable - current_string_nodes;
14775
14776                 /* If there is just no more room, go finish up this chunk of
14777                  * the pattern. */
14778                 if (delta <= 0) {
14779                     goto loopdone;
14780                 }
14781
14782                 change_engine_size(pRExC_state, delta + overhead_expansion);
14783                 current_string_nodes += delta;
14784                 max_string_len
14785                            = sizeof(struct regnode) * current_string_nodes;
14786                 upper_fill = max_string_len + 1;
14787
14788                 /* If the length was small, we know this was originally an
14789                  * EXACT node now converted to LEXACT, and the string has to be
14790                  * restored.  Otherwise the string was untouched.  260 is just
14791                  * a number safely above 255 so don't have to worry about
14792                  * getting it precise */
14793                 if (len < 260) {
14794                     node_type = LEXACT;
14795                     FILL_NODE(ret, node_type);
14796                     s0 = STRING(REGNODE_p(ret));
14797                     Copy(temp, s0, len, char);
14798                     s = s0 + len;
14799                 }
14800
14801                 goto continue_parse;
14802             }
14803             else if (FOLD) {
14804                 bool splittable = FALSE;
14805                 bool backed_up = FALSE;
14806                 char * e;       /* should this be U8? */
14807                 char * s_start; /* should this be U8? */
14808
14809                 /* Here is /i.  Running out of room creates a problem if we are
14810                  * folding, and the split happens in the middle of a
14811                  * multi-character fold, as a match that should have occurred,
14812                  * won't, due to the way nodes are matched, and our artificial
14813                  * boundary.  So back off until we aren't splitting such a
14814                  * fold.  If there is no such place to back off to, we end up
14815                  * taking the entire node as-is.  This can happen if the node
14816                  * consists entirely of 'f' or entirely of 's' characters (or
14817                  * things that fold to them) as 'ff' and 'ss' are
14818                  * multi-character folds.
14819                  *
14820                  * The Unicode standard says that multi character folds consist
14821                  * of either two or three characters.  That means we would be
14822                  * splitting one if the final character in the node is at the
14823                  * beginning of either type, or is the second of a three
14824                  * character fold.
14825                  *
14826                  * At this point:
14827                  *  ender     is the code point of the character that won't fit
14828                  *            in the node
14829                  *  s         points to just beyond the final byte in the node.
14830                  *            It's where we would place ender if there were
14831                  *            room, and where in fact we do place ender's fold
14832                  *            in the code below, as we've over-allocated space
14833                  *            for s0 (hence s) to allow for this
14834                  *  e         starts at 's' and advances as we append things.
14835                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14836                  *            have been advanced to beyond it).
14837                  *  old_old_s points to the beginning byte of the final
14838                  *            character in the node
14839                  *  p         points to the beginning byte in the input of the
14840                  *            character beyond 'ender'.
14841                  *  oldp      points to the beginning byte in the input of
14842                  *            'ender'.
14843                  *
14844                  * In the case of /il, we haven't folded anything that could be
14845                  * affected by the locale.  That means only above-Latin1
14846                  * characters that fold to other above-latin1 characters get
14847                  * folded at compile time.  To check where a good place to
14848                  * split nodes is, everything in it will have to be folded.
14849                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14850                  * any unfolded characters in the node. */
14851                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14852
14853                 /* If we do need to fold the node, we need a place to store the
14854                  * folded copy, and a way to map back to the unfolded original
14855                  * */
14856                 char * locfold_buf = NULL;
14857                 Size_t * loc_correspondence = NULL;
14858
14859                 if (! need_to_fold_loc) {   /* The normal case.  Just
14860                                                initialize to the actual node */
14861                     e = s;
14862                     s_start = s0;
14863                     s = old_old_s;  /* Point to the beginning of the final char
14864                                        that fits in the node */
14865                 }
14866                 else {
14867
14868                     /* Here, we have filled a /il node, and there are unfolded
14869                      * characters in it.  If the runtime locale turns out to be
14870                      * UTF-8, there are possible multi-character folds, just
14871                      * like when not under /l.  The node hence can't terminate
14872                      * in the middle of such a fold.  To determine this, we
14873                      * have to create a folded copy of this node.  That means
14874                      * reparsing the node, folding everything assuming a UTF-8
14875                      * locale.  (If at runtime it isn't such a locale, the
14876                      * actions here wouldn't have been necessary, but we have
14877                      * to assume the worst case.)  If we find we need to back
14878                      * off the folded string, we do so, and then map that
14879                      * position back to the original unfolded node, which then
14880                      * gets output, truncated at that spot */
14881
14882                     char * redo_p = RExC_parse;
14883                     char * redo_e;
14884                     char * old_redo_e;
14885
14886                     /* Allow enough space assuming a single byte input folds to
14887                      * a single byte output, plus assume that the two unparsed
14888                      * characters (that we may need) fold to the largest number
14889                      * of bytes possible, plus extra for one more worst case
14890                      * scenario.  In the loop below, if we start eating into
14891                      * that final spare space, we enlarge this initial space */
14892                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14893
14894                     Newxz(locfold_buf, size, char);
14895                     Newxz(loc_correspondence, size, Size_t);
14896
14897                     /* Redo this node's parse, folding into 'locfold_buf' */
14898                     redo_p = RExC_parse;
14899                     old_redo_e = redo_e = locfold_buf;
14900                     while (redo_p <= oldp) {
14901
14902                         old_redo_e = redo_e;
14903                         loc_correspondence[redo_e - locfold_buf]
14904                                                         = redo_p - RExC_parse;
14905
14906                         if (UTF) {
14907                             Size_t added_len;
14908
14909                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14910                                                        (U8 *) RExC_end,
14911                                                        (U8 *) redo_e,
14912                                                        &added_len,
14913                                                        FOLD_FLAGS_FULL);
14914                             redo_e += added_len;
14915                             redo_p += UTF8SKIP(redo_p);
14916                         }
14917                         else {
14918
14919                             /* Note that if this code is run on some ancient
14920                              * Unicode versions, SHARP S doesn't fold to 'ss',
14921                              * but rather than clutter the code with #ifdef's,
14922                              * as is done above, we ignore that possibility.
14923                              * This is ok because this code doesn't affect what
14924                              * gets matched, but merely where the node gets
14925                              * split */
14926                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14927                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14928                             }
14929                             else {
14930                                 *redo_e++ = 's';
14931                                 *redo_e++ = 's';
14932                             }
14933                             redo_p++;
14934                         }
14935
14936
14937                         /* If we're getting so close to the end that a
14938                          * worst-case fold in the next character would cause us
14939                          * to overflow, increase, assuming one byte output byte
14940                          * per one byte input one, plus room for another worst
14941                          * case fold */
14942                         if (   redo_p <= oldp
14943                             && redo_e > locfold_buf + size
14944                                                     - (UTF8_MAXBYTES_CASE + 1))
14945                         {
14946                             Size_t new_size = size
14947                                             + (oldp - redo_p)
14948                                             + UTF8_MAXBYTES_CASE + 1;
14949                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14950
14951                             Renew(locfold_buf, new_size, char);
14952                             Renew(loc_correspondence, new_size, Size_t);
14953                             size = new_size;
14954
14955                             redo_e = locfold_buf + e_offset;
14956                         }
14957                     }
14958
14959                     /* Set so that things are in terms of the folded, temporary
14960                      * string */
14961                     s = old_redo_e;
14962                     s_start = locfold_buf;
14963                     e = redo_e;
14964
14965                 }
14966
14967                 /* Here, we have 's', 's_start' and 'e' set up to point to the
14968                  * input that goes into the node, folded.
14969                  *
14970                  * If the final character of the node and the fold of ender
14971                  * form the first two characters of a three character fold, we
14972                  * need to peek ahead at the next (unparsed) character in the
14973                  * input to determine if the three actually do form such a
14974                  * fold.  Just looking at that character is not generally
14975                  * sufficient, as it could be, for example, an escape sequence
14976                  * that evaluates to something else, and it needs to be folded.
14977                  *
14978                  * khw originally thought to just go through the parse loop one
14979                  * extra time, but that doesn't work easily as that iteration
14980                  * could cause things to think that the parse is over and to
14981                  * goto loopdone.  The character could be a '$' for example, or
14982                  * the character beyond could be a quantifier, and other
14983                  * glitches as well.
14984                  *
14985                  * The solution used here for peeking ahead is to look at that
14986                  * next character.  If it isn't ASCII punctuation, then it will
14987                  * be something that continues in an EXACTish node if there
14988                  * were space.  We append the fold of it to s, having reserved
14989                  * enough room in s0 for the purpose.  If we can't reasonably
14990                  * peek ahead, we instead assume the worst case: that it is
14991                  * something that would form the completion of a multi-char
14992                  * fold.
14993                  *
14994                  * If we can't split between s and ender, we work backwards
14995                  * character-by-character down to s0.  At each current point
14996                  * see if we are at the beginning of a multi-char fold.  If so,
14997                  * that means we would be splitting the fold across nodes, and
14998                  * so we back up one and try again.
14999                  *
15000                  * If we're not at the beginning, we still could be at the
15001                  * final two characters of a (rare) three character fold.  We
15002                  * check if the sequence starting at the character before the
15003                  * current position (and including the current and next
15004                  * characters) is a three character fold.  If not, the node can
15005                  * be split here.  If it is, we have to backup two characters
15006                  * and try again.
15007                  *
15008                  * Otherwise, the node can be split at the current position.
15009                  *
15010                  * The same logic is used for UTF-8 patterns and not */
15011                 if (UTF) {
15012                     Size_t added_len;
15013
15014                     /* Append the fold of ender */
15015                     (void) _to_uni_fold_flags(
15016                         ender,
15017                         (U8 *) e,
15018                         &added_len,
15019                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15020                                         ? FOLD_FLAGS_NOMIX_ASCII
15021                                         : 0));
15022                     e += added_len;
15023
15024                     /* 's' and the character folded to by ender may be the
15025                      * first two of a three-character fold, in which case the
15026                      * node should not be split here.  That may mean examining
15027                      * the so-far unparsed character starting at 'p'.  But if
15028                      * ender folded to more than one character, we already have
15029                      * three characters to look at.  Also, we first check if
15030                      * the sequence consisting of s and the next character form
15031                      * the first two of some three character fold.  If not,
15032                      * there's no need to peek ahead. */
15033                     if (   added_len <= UTF8SKIP(e - added_len)
15034                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15035                     {
15036                         /* Here, the two do form the beginning of a potential
15037                          * three character fold.  The unexamined character may
15038                          * or may not complete it.  Peek at it.  It might be
15039                          * something that ends the node or an escape sequence,
15040                          * in which case we don't know without a lot of work
15041                          * what it evaluates to, so we have to assume the worst
15042                          * case: that it does complete the fold, and so we
15043                          * can't split here.  All such instances  will have
15044                          * that character be an ASCII punctuation character,
15045                          * like a backslash.  So, for that case, backup one and
15046                          * drop down to try at that position */
15047                         if (isPUNCT(*p)) {
15048                             s = (char *) utf8_hop_back((U8 *) s, -1,
15049                                        (U8 *) s_start);
15050                             backed_up = TRUE;
15051                         }
15052                         else {
15053                             /* Here, since it's not punctuation, it must be a
15054                              * real character, and we can append its fold to
15055                              * 'e' (having deliberately reserved enough space
15056                              * for this eventuality) and drop down to check if
15057                              * the three actually do form a folded sequence */
15058                             (void) _to_utf8_fold_flags(
15059                                 (U8 *) p, (U8 *) RExC_end,
15060                                 (U8 *) e,
15061                                 &added_len,
15062                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15063                                                 ? FOLD_FLAGS_NOMIX_ASCII
15064                                                 : 0));
15065                             e += added_len;
15066                         }
15067                     }
15068
15069                     /* Here, we either have three characters available in
15070                      * sequence starting at 's', or we have two characters and
15071                      * know that the following one can't possibly be part of a
15072                      * three character fold.  We go through the node backwards
15073                      * until we find a place where we can split it without
15074                      * breaking apart a multi-character fold.  At any given
15075                      * point we have to worry about if such a fold begins at
15076                      * the current 's', and also if a three-character fold
15077                      * begins at s-1, (containing s and s+1).  Splitting in
15078                      * either case would break apart a fold */
15079                     do {
15080                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15081                                                             (U8 *) s_start);
15082
15083                         /* If is a multi-char fold, can't split here.  Backup
15084                          * one char and try again */
15085                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15086                             s = prev_s;
15087                             backed_up = TRUE;
15088                             continue;
15089                         }
15090
15091                         /* If the two characters beginning at 's' are part of a
15092                          * three character fold starting at the character
15093                          * before s, we can't split either before or after s.
15094                          * Backup two chars and try again */
15095                         if (   LIKELY(s > s_start)
15096                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15097                         {
15098                             s = prev_s;
15099                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15100                             backed_up = TRUE;
15101                             continue;
15102                         }
15103
15104                         /* Here there's no multi-char fold between s and the
15105                          * next character following it.  We can split */
15106                         splittable = TRUE;
15107                         break;
15108
15109                     } while (s > s_start); /* End of loops backing up through the node */
15110
15111                     /* Here we either couldn't find a place to split the node,
15112                      * or else we broke out of the loop setting 'splittable' to
15113                      * true.  In the latter case, the place to split is between
15114                      * the first and second characters in the sequence starting
15115                      * at 's' */
15116                     if (splittable) {
15117                         s += UTF8SKIP(s);
15118                     }
15119                 }
15120                 else {  /* Pattern not UTF-8 */
15121                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
15122                         || ASCII_FOLD_RESTRICTED)
15123                     {
15124                         assert( toLOWER_L1(ender) < 256 );
15125                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15126                     }
15127                     else {
15128                         *e++ = 's';
15129                         *e++ = 's';
15130                     }
15131
15132                     if (   e - s  <= 1
15133                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15134                     {
15135                         if (isPUNCT(*p)) {
15136                             s--;
15137                             backed_up = TRUE;
15138                         }
15139                         else {
15140                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15141                                 || ASCII_FOLD_RESTRICTED)
15142                             {
15143                                 assert( toLOWER_L1(ender) < 256 );
15144                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15145                             }
15146                             else {
15147                                 *e++ = 's';
15148                                 *e++ = 's';
15149                             }
15150                         }
15151                     }
15152
15153                     do {
15154                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15155                             s--;
15156                             backed_up = TRUE;
15157                             continue;
15158                         }
15159
15160                         if (   LIKELY(s > s_start)
15161                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15162                         {
15163                             s -= 2;
15164                             backed_up = TRUE;
15165                             continue;
15166                         }
15167
15168                         splittable = TRUE;
15169                         break;
15170
15171                     } while (s > s_start);
15172
15173                     if (splittable) {
15174                         s++;
15175                     }
15176                 }
15177
15178                 /* Here, we are done backing up.  If we didn't backup at all
15179                  * (the likely case), just proceed */
15180                 if (backed_up) {
15181
15182                    /* If we did find a place to split, reparse the entire node
15183                     * stopping where we have calculated. */
15184                     if (splittable) {
15185
15186                        /* If we created a temporary folded string under /l, we
15187                         * have to map that back to the original */
15188                         if (need_to_fold_loc) {
15189                             upper_fill = loc_correspondence[s - s_start];
15190                             if (upper_fill == 0) {
15191                                 FAIL2("panic: loc_correspondence[%d] is 0",
15192                                       (int) (s - s_start));
15193                             }
15194                             Safefree(locfold_buf);
15195                             Safefree(loc_correspondence);
15196                         }
15197                         else {
15198                             upper_fill = s - s0;
15199                         }
15200                         goto reparse;
15201                     }
15202
15203                     /* Here the node consists entirely of non-final multi-char
15204                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
15205                      * decent place to split it, so give up and just take the
15206                      * whole thing */
15207                     len = old_s - s0;
15208                 }
15209
15210                 if (need_to_fold_loc) {
15211                     Safefree(locfold_buf);
15212                     Safefree(loc_correspondence);
15213                 }
15214             }   /* End of verifying node ends with an appropriate char */
15215
15216             /* We need to start the next node at the character that didn't fit
15217              * in this one */
15218             p = oldp;
15219
15220           loopdone:   /* Jumped to when encounters something that shouldn't be
15221                          in the node */
15222
15223             /* Free up any over-allocated space; cast is to silence bogus
15224              * warning in MS VC */
15225             change_engine_size(pRExC_state,
15226                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15227
15228             /* I (khw) don't know if you can get here with zero length, but the
15229              * old code handled this situation by creating a zero-length EXACT
15230              * node.  Might as well be NOTHING instead */
15231             if (len == 0) {
15232                 OP(REGNODE_p(ret)) = NOTHING;
15233             }
15234             else {
15235
15236                 /* If the node type is EXACT here, check to see if it
15237                  * should be EXACTL, or EXACT_REQ8. */
15238                 if (node_type == EXACT) {
15239                     if (LOC) {
15240                         node_type = EXACTL;
15241                     }
15242                     else if (requires_utf8_target) {
15243                         node_type = EXACT_REQ8;
15244                     }
15245                 }
15246                 else if (node_type == LEXACT) {
15247                     if (requires_utf8_target) {
15248                         node_type = LEXACT_REQ8;
15249                     }
15250                 }
15251                 else if (FOLD) {
15252                     if (    UNLIKELY(has_micro_sign || has_ss)
15253                         && (node_type == EXACTFU || (   node_type == EXACTF
15254                                                      && maybe_exactfu)))
15255                     {   /* These two conditions are problematic in non-UTF-8
15256                            EXACTFU nodes. */
15257                         assert(! UTF);
15258                         node_type = EXACTFUP;
15259                     }
15260                     else if (node_type == EXACTFL) {
15261
15262                         /* 'maybe_exactfu' is deliberately set above to
15263                          * indicate this node type, where all code points in it
15264                          * are above 255 */
15265                         if (maybe_exactfu) {
15266                             node_type = EXACTFLU8;
15267                         }
15268                         else if (UNLIKELY(
15269                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15270                         {
15271                             /* A character that folds to more than one will
15272                              * match multiple characters, so can't be SIMPLE.
15273                              * We don't have to worry about this with EXACTFLU8
15274                              * nodes just above, as they have already been
15275                              * folded (since the fold doesn't vary at run
15276                              * time).  Here, if the final character in the node
15277                              * folds to multiple, it can't be simple.  (This
15278                              * only has an effect if the node has only a single
15279                              * character, hence the final one, as elsewhere we
15280                              * turn off simple for nodes whose length > 1 */
15281                             maybe_SIMPLE = 0;
15282                         }
15283                     }
15284                     else if (node_type == EXACTF) {  /* Means is /di */
15285
15286                         /* This intermediate variable is needed solely because
15287                          * the asserts in the macro where used exceed Win32's
15288                          * literal string capacity */
15289                         char first_char = * STRING(REGNODE_p(ret));
15290
15291                         /* If 'maybe_exactfu' is clear, then we need to stay
15292                          * /di.  If it is set, it means there are no code
15293                          * points that match differently depending on UTF8ness
15294                          * of the target string, so it can become an EXACTFU
15295                          * node */
15296                         if (! maybe_exactfu) {
15297                             RExC_seen_d_op = TRUE;
15298                         }
15299                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15300                                  || isALPHA_FOLD_EQ(ender, 's'))
15301                         {
15302                             /* But, if the node begins or ends in an 's' we
15303                              * have to defer changing it into an EXACTFU, as
15304                              * the node could later get joined with another one
15305                              * that ends or begins with 's' creating an 'ss'
15306                              * sequence which would then wrongly match the
15307                              * sharp s without the target being UTF-8.  We
15308                              * create a special node that we resolve later when
15309                              * we join nodes together */
15310
15311                             node_type = EXACTFU_S_EDGE;
15312                         }
15313                         else {
15314                             node_type = EXACTFU;
15315                         }
15316                     }
15317
15318                     if (requires_utf8_target && node_type == EXACTFU) {
15319                         node_type = EXACTFU_REQ8;
15320                     }
15321                 }
15322
15323                 OP(REGNODE_p(ret)) = node_type;
15324                 setSTR_LEN(REGNODE_p(ret), len);
15325                 RExC_emit += STR_SZ(len);
15326
15327                 /* If the node isn't a single character, it can't be SIMPLE */
15328                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15329                     maybe_SIMPLE = 0;
15330                 }
15331
15332                 *flagp |= HASWIDTH | maybe_SIMPLE;
15333             }
15334
15335             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15336             RExC_parse = p;
15337
15338             {
15339                 /* len is STRLEN which is unsigned, need to copy to signed */
15340                 IV iv = len;
15341                 if (iv < 0)
15342                     vFAIL("Internal disaster");
15343             }
15344
15345         } /* End of label 'defchar:' */
15346         break;
15347     } /* End of giant switch on input character */
15348
15349     /* Position parse to next real character */
15350     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15351                                             FALSE /* Don't force to /x */ );
15352     if (   *RExC_parse == '{'
15353         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15354     {
15355         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15356             RExC_parse++;
15357             vFAIL("Unescaped left brace in regex is illegal here");
15358         }
15359         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15360                                   " passed through");
15361     }
15362
15363     return(ret);
15364 }
15365
15366
15367 STATIC void
15368 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15369 {
15370     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15371      * sets up the bitmap and any flags, removing those code points from the
15372      * inversion list, setting it to NULL should it become completely empty */
15373
15374
15375     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15376     assert(PL_regkind[OP(node)] == ANYOF);
15377
15378     /* There is no bitmap for this node type */
15379     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15380         return;
15381     }
15382
15383     ANYOF_BITMAP_ZERO(node);
15384     if (*invlist_ptr) {
15385
15386         /* This gets set if we actually need to modify things */
15387         bool change_invlist = FALSE;
15388
15389         UV start, end;
15390
15391         /* Start looking through *invlist_ptr */
15392         invlist_iterinit(*invlist_ptr);
15393         while (invlist_iternext(*invlist_ptr, &start, &end)) {
15394             UV high;
15395             int i;
15396
15397             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15398                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15399             }
15400
15401             /* Quit if are above what we should change */
15402             if (start >= NUM_ANYOF_CODE_POINTS) {
15403                 break;
15404             }
15405
15406             change_invlist = TRUE;
15407
15408             /* Set all the bits in the range, up to the max that we are doing */
15409             high = (end < NUM_ANYOF_CODE_POINTS - 1)
15410                    ? end
15411                    : NUM_ANYOF_CODE_POINTS - 1;
15412             for (i = start; i <= (int) high; i++) {
15413                 ANYOF_BITMAP_SET(node, i);
15414             }
15415         }
15416         invlist_iterfinish(*invlist_ptr);
15417
15418         /* Done with loop; remove any code points that are in the bitmap from
15419          * *invlist_ptr; similarly for code points above the bitmap if we have
15420          * a flag to match all of them anyways */
15421         if (change_invlist) {
15422             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15423         }
15424         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15425             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15426         }
15427
15428         /* If have completely emptied it, remove it completely */
15429         if (_invlist_len(*invlist_ptr) == 0) {
15430             SvREFCNT_dec_NN(*invlist_ptr);
15431             *invlist_ptr = NULL;
15432         }
15433     }
15434 }
15435
15436 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15437    Character classes ([:foo:]) can also be negated ([:^foo:]).
15438    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15439    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15440    but trigger failures because they are currently unimplemented. */
15441
15442 #define POSIXCC_DONE(c)   ((c) == ':')
15443 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15444 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15445 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15446
15447 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15448 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15449 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15450
15451 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15452
15453 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15454  * routine. q.v. */
15455 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15456         if (posix_warnings) {                                               \
15457             if (! RExC_warn_text ) RExC_warn_text =                         \
15458                                          (AV *) sv_2mortal((SV *) newAV()); \
15459             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15460                                              WARNING_PREFIX                 \
15461                                              text                           \
15462                                              REPORT_LOCATION,               \
15463                                              REPORT_LOCATION_ARGS(p)));     \
15464         }                                                                   \
15465     } STMT_END
15466 #define CLEAR_POSIX_WARNINGS()                                              \
15467     STMT_START {                                                            \
15468         if (posix_warnings && RExC_warn_text)                               \
15469             av_clear(RExC_warn_text);                                       \
15470     } STMT_END
15471
15472 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15473     STMT_START {                                                            \
15474         CLEAR_POSIX_WARNINGS();                                             \
15475         return ret;                                                         \
15476     } STMT_END
15477
15478 STATIC int
15479 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15480
15481     const char * const s,      /* Where the putative posix class begins.
15482                                   Normally, this is one past the '['.  This
15483                                   parameter exists so it can be somewhere
15484                                   besides RExC_parse. */
15485     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15486                                   NULL */
15487     AV ** posix_warnings,      /* Where to place any generated warnings, or
15488                                   NULL */
15489     const bool check_only      /* Don't die if error */
15490 )
15491 {
15492     /* This parses what the caller thinks may be one of the three POSIX
15493      * constructs:
15494      *  1) a character class, like [:blank:]
15495      *  2) a collating symbol, like [. .]
15496      *  3) an equivalence class, like [= =]
15497      * In the latter two cases, it croaks if it finds a syntactically legal
15498      * one, as these are not handled by Perl.
15499      *
15500      * The main purpose is to look for a POSIX character class.  It returns:
15501      *  a) the class number
15502      *      if it is a completely syntactically and semantically legal class.
15503      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15504      *      closing ']' of the class
15505      *  b) OOB_NAMEDCLASS
15506      *      if it appears that one of the three POSIX constructs was meant, but
15507      *      its specification was somehow defective.  'updated_parse_ptr', if
15508      *      not NULL, is set to point to the character just after the end
15509      *      character of the class.  See below for handling of warnings.
15510      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15511      *      if it  doesn't appear that a POSIX construct was intended.
15512      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15513      *      raised.
15514      *
15515      * In b) there may be errors or warnings generated.  If 'check_only' is
15516      * TRUE, then any errors are discarded.  Warnings are returned to the
15517      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15518      * instead it is NULL, warnings are suppressed.
15519      *
15520      * The reason for this function, and its complexity is that a bracketed
15521      * character class can contain just about anything.  But it's easy to
15522      * mistype the very specific posix class syntax but yielding a valid
15523      * regular bracketed class, so it silently gets compiled into something
15524      * quite unintended.
15525      *
15526      * The solution adopted here maintains backward compatibility except that
15527      * it adds a warning if it looks like a posix class was intended but
15528      * improperly specified.  The warning is not raised unless what is input
15529      * very closely resembles one of the 14 legal posix classes.  To do this,
15530      * it uses fuzzy parsing.  It calculates how many single-character edits it
15531      * would take to transform what was input into a legal posix class.  Only
15532      * if that number is quite small does it think that the intention was a
15533      * posix class.  Obviously these are heuristics, and there will be cases
15534      * where it errs on one side or another, and they can be tweaked as
15535      * experience informs.
15536      *
15537      * The syntax for a legal posix class is:
15538      *
15539      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15540      *
15541      * What this routine considers syntactically to be an intended posix class
15542      * is this (the comments indicate some restrictions that the pattern
15543      * doesn't show):
15544      *
15545      *  qr/(?x: \[?                         # The left bracket, possibly
15546      *                                      # omitted
15547      *          \h*                         # possibly followed by blanks
15548      *          (?: \^ \h* )?               # possibly a misplaced caret
15549      *          [:;]?                       # The opening class character,
15550      *                                      # possibly omitted.  A typo
15551      *                                      # semi-colon can also be used.
15552      *          \h*
15553      *          \^?                         # possibly a correctly placed
15554      *                                      # caret, but not if there was also
15555      *                                      # a misplaced one
15556      *          \h*
15557      *          .{3,15}                     # The class name.  If there are
15558      *                                      # deviations from the legal syntax,
15559      *                                      # its edit distance must be close
15560      *                                      # to a real class name in order
15561      *                                      # for it to be considered to be
15562      *                                      # an intended posix class.
15563      *          \h*
15564      *          [[:punct:]]?                # The closing class character,
15565      *                                      # possibly omitted.  If not a colon
15566      *                                      # nor semi colon, the class name
15567      *                                      # must be even closer to a valid
15568      *                                      # one
15569      *          \h*
15570      *          \]?                         # The right bracket, possibly
15571      *                                      # omitted.
15572      *     )/
15573      *
15574      * In the above, \h must be ASCII-only.
15575      *
15576      * These are heuristics, and can be tweaked as field experience dictates.
15577      * There will be cases when someone didn't intend to specify a posix class
15578      * that this warns as being so.  The goal is to minimize these, while
15579      * maximizing the catching of things intended to be a posix class that
15580      * aren't parsed as such.
15581      */
15582
15583     const char* p             = s;
15584     const char * const e      = RExC_end;
15585     unsigned complement       = 0;      /* If to complement the class */
15586     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15587     bool has_opening_bracket  = FALSE;
15588     bool has_opening_colon    = FALSE;
15589     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15590                                                    valid class */
15591     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15592     const char* name_start;             /* ptr to class name first char */
15593
15594     /* If the number of single-character typos the input name is away from a
15595      * legal name is no more than this number, it is considered to have meant
15596      * the legal name */
15597     int max_distance          = 2;
15598
15599     /* to store the name.  The size determines the maximum length before we
15600      * decide that no posix class was intended.  Should be at least
15601      * sizeof("alphanumeric") */
15602     UV input_text[15];
15603     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15604
15605     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15606
15607     CLEAR_POSIX_WARNINGS();
15608
15609     if (p >= e) {
15610         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15611     }
15612
15613     if (*(p - 1) != '[') {
15614         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15615         found_problem = TRUE;
15616     }
15617     else {
15618         has_opening_bracket = TRUE;
15619     }
15620
15621     /* They could be confused and think you can put spaces between the
15622      * components */
15623     if (isBLANK(*p)) {
15624         found_problem = TRUE;
15625
15626         do {
15627             p++;
15628         } while (p < e && isBLANK(*p));
15629
15630         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15631     }
15632
15633     /* For [. .] and [= =].  These are quite different internally from [: :],
15634      * so they are handled separately.  */
15635     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15636                                             and 1 for at least one char in it
15637                                           */
15638     {
15639         const char open_char  = *p;
15640         const char * temp_ptr = p + 1;
15641
15642         /* These two constructs are not handled by perl, and if we find a
15643          * syntactically valid one, we croak.  khw, who wrote this code, finds
15644          * this explanation of them very unclear:
15645          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15646          * And searching the rest of the internet wasn't very helpful either.
15647          * It looks like just about any byte can be in these constructs,
15648          * depending on the locale.  But unless the pattern is being compiled
15649          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15650          * In that case, it looks like [= =] isn't allowed at all, and that
15651          * [. .] could be any single code point, but for longer strings the
15652          * constituent characters would have to be the ASCII alphabetics plus
15653          * the minus-hyphen.  Any sensible locale definition would limit itself
15654          * to these.  And any portable one definitely should.  Trying to parse
15655          * the general case is a nightmare (see [perl #127604]).  So, this code
15656          * looks only for interiors of these constructs that match:
15657          *      qr/.|[-\w]{2,}/
15658          * Using \w relaxes the apparent rules a little, without adding much
15659          * danger of mistaking something else for one of these constructs.
15660          *
15661          * [. .] in some implementations described on the internet is usable to
15662          * escape a character that otherwise is special in bracketed character
15663          * classes.  For example [.].] means a literal right bracket instead of
15664          * the ending of the class
15665          *
15666          * [= =] can legitimately contain a [. .] construct, but we don't
15667          * handle this case, as that [. .] construct will later get parsed
15668          * itself and croak then.  And [= =] is checked for even when not under
15669          * /l, as Perl has long done so.
15670          *
15671          * The code below relies on there being a trailing NUL, so it doesn't
15672          * have to keep checking if the parse ptr < e.
15673          */
15674         if (temp_ptr[1] == open_char) {
15675             temp_ptr++;
15676         }
15677         else while (    temp_ptr < e
15678                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15679         {
15680             temp_ptr++;
15681         }
15682
15683         if (*temp_ptr == open_char) {
15684             temp_ptr++;
15685             if (*temp_ptr == ']') {
15686                 temp_ptr++;
15687                 if (! found_problem && ! check_only) {
15688                     RExC_parse = (char *) temp_ptr;
15689                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15690                             "extensions", open_char, open_char);
15691                 }
15692
15693                 /* Here, the syntax wasn't completely valid, or else the call
15694                  * is to check-only */
15695                 if (updated_parse_ptr) {
15696                     *updated_parse_ptr = (char *) temp_ptr;
15697                 }
15698
15699                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15700             }
15701         }
15702
15703         /* If we find something that started out to look like one of these
15704          * constructs, but isn't, we continue below so that it can be checked
15705          * for being a class name with a typo of '.' or '=' instead of a colon.
15706          * */
15707     }
15708
15709     /* Here, we think there is a possibility that a [: :] class was meant, and
15710      * we have the first real character.  It could be they think the '^' comes
15711      * first */
15712     if (*p == '^') {
15713         found_problem = TRUE;
15714         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15715         complement = 1;
15716         p++;
15717
15718         if (isBLANK(*p)) {
15719             found_problem = TRUE;
15720
15721             do {
15722                 p++;
15723             } while (p < e && isBLANK(*p));
15724
15725             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15726         }
15727     }
15728
15729     /* But the first character should be a colon, which they could have easily
15730      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15731      * distinguish from a colon, so treat that as a colon).  */
15732     if (*p == ':') {
15733         p++;
15734         has_opening_colon = TRUE;
15735     }
15736     else if (*p == ';') {
15737         found_problem = TRUE;
15738         p++;
15739         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15740         has_opening_colon = TRUE;
15741     }
15742     else {
15743         found_problem = TRUE;
15744         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15745
15746         /* Consider an initial punctuation (not one of the recognized ones) to
15747          * be a left terminator */
15748         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15749             p++;
15750         }
15751     }
15752
15753     /* They may think that you can put spaces between the components */
15754     if (isBLANK(*p)) {
15755         found_problem = TRUE;
15756
15757         do {
15758             p++;
15759         } while (p < e && isBLANK(*p));
15760
15761         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15762     }
15763
15764     if (*p == '^') {
15765
15766         /* We consider something like [^:^alnum:]] to not have been intended to
15767          * be a posix class, but XXX maybe we should */
15768         if (complement) {
15769             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15770         }
15771
15772         complement = 1;
15773         p++;
15774     }
15775
15776     /* Again, they may think that you can put spaces between the components */
15777     if (isBLANK(*p)) {
15778         found_problem = TRUE;
15779
15780         do {
15781             p++;
15782         } while (p < e && isBLANK(*p));
15783
15784         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15785     }
15786
15787     if (*p == ']') {
15788
15789         /* XXX This ']' may be a typo, and something else was meant.  But
15790          * treating it as such creates enough complications, that that
15791          * possibility isn't currently considered here.  So we assume that the
15792          * ']' is what is intended, and if we've already found an initial '[',
15793          * this leaves this construct looking like [:] or [:^], which almost
15794          * certainly weren't intended to be posix classes */
15795         if (has_opening_bracket) {
15796             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15797         }
15798
15799         /* But this function can be called when we parse the colon for
15800          * something like qr/[alpha:]]/, so we back up to look for the
15801          * beginning */
15802         p--;
15803
15804         if (*p == ';') {
15805             found_problem = TRUE;
15806             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15807         }
15808         else if (*p != ':') {
15809
15810             /* XXX We are currently very restrictive here, so this code doesn't
15811              * consider the possibility that, say, /[alpha.]]/ was intended to
15812              * be a posix class. */
15813             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15814         }
15815
15816         /* Here we have something like 'foo:]'.  There was no initial colon,
15817          * and we back up over 'foo.  XXX Unlike the going forward case, we
15818          * don't handle typos of non-word chars in the middle */
15819         has_opening_colon = FALSE;
15820         p--;
15821
15822         while (p > RExC_start && isWORDCHAR(*p)) {
15823             p--;
15824         }
15825         p++;
15826
15827         /* Here, we have positioned ourselves to where we think the first
15828          * character in the potential class is */
15829     }
15830
15831     /* Now the interior really starts.  There are certain key characters that
15832      * can end the interior, or these could just be typos.  To catch both
15833      * cases, we may have to do two passes.  In the first pass, we keep on
15834      * going unless we come to a sequence that matches
15835      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15836      * This means it takes a sequence to end the pass, so two typos in a row if
15837      * that wasn't what was intended.  If the class is perfectly formed, just
15838      * this one pass is needed.  We also stop if there are too many characters
15839      * being accumulated, but this number is deliberately set higher than any
15840      * real class.  It is set high enough so that someone who thinks that
15841      * 'alphanumeric' is a correct name would get warned that it wasn't.
15842      * While doing the pass, we keep track of where the key characters were in
15843      * it.  If we don't find an end to the class, and one of the key characters
15844      * was found, we redo the pass, but stop when we get to that character.
15845      * Thus the key character was considered a typo in the first pass, but a
15846      * terminator in the second.  If two key characters are found, we stop at
15847      * the second one in the first pass.  Again this can miss two typos, but
15848      * catches a single one
15849      *
15850      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15851      * point to the first key character.  For the second pass, it starts as -1.
15852      * */
15853
15854     name_start = p;
15855   parse_name:
15856     {
15857         bool has_blank               = FALSE;
15858         bool has_upper               = FALSE;
15859         bool has_terminating_colon   = FALSE;
15860         bool has_terminating_bracket = FALSE;
15861         bool has_semi_colon          = FALSE;
15862         unsigned int name_len        = 0;
15863         int punct_count              = 0;
15864
15865         while (p < e) {
15866
15867             /* Squeeze out blanks when looking up the class name below */
15868             if (isBLANK(*p) ) {
15869                 has_blank = TRUE;
15870                 found_problem = TRUE;
15871                 p++;
15872                 continue;
15873             }
15874
15875             /* The name will end with a punctuation */
15876             if (isPUNCT(*p)) {
15877                 const char * peek = p + 1;
15878
15879                 /* Treat any non-']' punctuation followed by a ']' (possibly
15880                  * with intervening blanks) as trying to terminate the class.
15881                  * ']]' is very likely to mean a class was intended (but
15882                  * missing the colon), but the warning message that gets
15883                  * generated shows the error position better if we exit the
15884                  * loop at the bottom (eventually), so skip it here. */
15885                 if (*p != ']') {
15886                     if (peek < e && isBLANK(*peek)) {
15887                         has_blank = TRUE;
15888                         found_problem = TRUE;
15889                         do {
15890                             peek++;
15891                         } while (peek < e && isBLANK(*peek));
15892                     }
15893
15894                     if (peek < e && *peek == ']') {
15895                         has_terminating_bracket = TRUE;
15896                         if (*p == ':') {
15897                             has_terminating_colon = TRUE;
15898                         }
15899                         else if (*p == ';') {
15900                             has_semi_colon = TRUE;
15901                             has_terminating_colon = TRUE;
15902                         }
15903                         else {
15904                             found_problem = TRUE;
15905                         }
15906                         p = peek + 1;
15907                         goto try_posix;
15908                     }
15909                 }
15910
15911                 /* Here we have punctuation we thought didn't end the class.
15912                  * Keep track of the position of the key characters that are
15913                  * more likely to have been class-enders */
15914                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15915
15916                     /* Allow just one such possible class-ender not actually
15917                      * ending the class. */
15918                     if (possible_end) {
15919                         break;
15920                     }
15921                     possible_end = p;
15922                 }
15923
15924                 /* If we have too many punctuation characters, no use in
15925                  * keeping going */
15926                 if (++punct_count > max_distance) {
15927                     break;
15928                 }
15929
15930                 /* Treat the punctuation as a typo. */
15931                 input_text[name_len++] = *p;
15932                 p++;
15933             }
15934             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15935                 input_text[name_len++] = toLOWER(*p);
15936                 has_upper = TRUE;
15937                 found_problem = TRUE;
15938                 p++;
15939             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15940                 input_text[name_len++] = *p;
15941                 p++;
15942             }
15943             else {
15944                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15945                 p+= UTF8SKIP(p);
15946             }
15947
15948             /* The declaration of 'input_text' is how long we allow a potential
15949              * class name to be, before saying they didn't mean a class name at
15950              * all */
15951             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15952                 break;
15953             }
15954         }
15955
15956         /* We get to here when the possible class name hasn't been properly
15957          * terminated before:
15958          *   1) we ran off the end of the pattern; or
15959          *   2) found two characters, each of which might have been intended to
15960          *      be the name's terminator
15961          *   3) found so many punctuation characters in the purported name,
15962          *      that the edit distance to a valid one is exceeded
15963          *   4) we decided it was more characters than anyone could have
15964          *      intended to be one. */
15965
15966         found_problem = TRUE;
15967
15968         /* In the final two cases, we know that looking up what we've
15969          * accumulated won't lead to a match, even a fuzzy one. */
15970         if (   name_len >= C_ARRAY_LENGTH(input_text)
15971             || punct_count > max_distance)
15972         {
15973             /* If there was an intermediate key character that could have been
15974              * an intended end, redo the parse, but stop there */
15975             if (possible_end && possible_end != (char *) -1) {
15976                 possible_end = (char *) -1; /* Special signal value to say
15977                                                we've done a first pass */
15978                 p = name_start;
15979                 goto parse_name;
15980             }
15981
15982             /* Otherwise, it can't have meant to have been a class */
15983             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15984         }
15985
15986         /* If we ran off the end, and the final character was a punctuation
15987          * one, back up one, to look at that final one just below.  Later, we
15988          * will restore the parse pointer if appropriate */
15989         if (name_len && p == e && isPUNCT(*(p-1))) {
15990             p--;
15991             name_len--;
15992         }
15993
15994         if (p < e && isPUNCT(*p)) {
15995             if (*p == ']') {
15996                 has_terminating_bracket = TRUE;
15997
15998                 /* If this is a 2nd ']', and the first one is just below this
15999                  * one, consider that to be the real terminator.  This gives a
16000                  * uniform and better positioning for the warning message  */
16001                 if (   possible_end
16002                     && possible_end != (char *) -1
16003                     && *possible_end == ']'
16004                     && name_len && input_text[name_len - 1] == ']')
16005                 {
16006                     name_len--;
16007                     p = possible_end;
16008
16009                     /* And this is actually equivalent to having done the 2nd
16010                      * pass now, so set it to not try again */
16011                     possible_end = (char *) -1;
16012                 }
16013             }
16014             else {
16015                 if (*p == ':') {
16016                     has_terminating_colon = TRUE;
16017                 }
16018                 else if (*p == ';') {
16019                     has_semi_colon = TRUE;
16020                     has_terminating_colon = TRUE;
16021                 }
16022                 p++;
16023             }
16024         }
16025
16026     try_posix:
16027
16028         /* Here, we have a class name to look up.  We can short circuit the
16029          * stuff below for short names that can't possibly be meant to be a
16030          * class name.  (We can do this on the first pass, as any second pass
16031          * will yield an even shorter name) */
16032         if (name_len < 3) {
16033             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16034         }
16035
16036         /* Find which class it is.  Initially switch on the length of the name.
16037          * */
16038         switch (name_len) {
16039             case 4:
16040                 if (memEQs(name_start, 4, "word")) {
16041                     /* this is not POSIX, this is the Perl \w */
16042                     class_number = ANYOF_WORDCHAR;
16043                 }
16044                 break;
16045             case 5:
16046                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16047                  *                        graph lower print punct space upper
16048                  * Offset 4 gives the best switch position.  */
16049                 switch (name_start[4]) {
16050                     case 'a':
16051                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
16052                             class_number = ANYOF_ALPHA;
16053                         break;
16054                     case 'e':
16055                         if (memBEGINs(name_start, 5, "spac")) /* space */
16056                             class_number = ANYOF_SPACE;
16057                         break;
16058                     case 'h':
16059                         if (memBEGINs(name_start, 5, "grap")) /* graph */
16060                             class_number = ANYOF_GRAPH;
16061                         break;
16062                     case 'i':
16063                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
16064                             class_number = ANYOF_ASCII;
16065                         break;
16066                     case 'k':
16067                         if (memBEGINs(name_start, 5, "blan")) /* blank */
16068                             class_number = ANYOF_BLANK;
16069                         break;
16070                     case 'l':
16071                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16072                             class_number = ANYOF_CNTRL;
16073                         break;
16074                     case 'm':
16075                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16076                             class_number = ANYOF_ALPHANUMERIC;
16077                         break;
16078                     case 'r':
16079                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
16080                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16081                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16082                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16083                         break;
16084                     case 't':
16085                         if (memBEGINs(name_start, 5, "digi")) /* digit */
16086                             class_number = ANYOF_DIGIT;
16087                         else if (memBEGINs(name_start, 5, "prin")) /* print */
16088                             class_number = ANYOF_PRINT;
16089                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
16090                             class_number = ANYOF_PUNCT;
16091                         break;
16092                 }
16093                 break;
16094             case 6:
16095                 if (memEQs(name_start, 6, "xdigit"))
16096                     class_number = ANYOF_XDIGIT;
16097                 break;
16098         }
16099
16100         /* If the name exactly matches a posix class name the class number will
16101          * here be set to it, and the input almost certainly was meant to be a
16102          * posix class, so we can skip further checking.  If instead the syntax
16103          * is exactly correct, but the name isn't one of the legal ones, we
16104          * will return that as an error below.  But if neither of these apply,
16105          * it could be that no posix class was intended at all, or that one
16106          * was, but there was a typo.  We tease these apart by doing fuzzy
16107          * matching on the name */
16108         if (class_number == OOB_NAMEDCLASS && found_problem) {
16109             const UV posix_names[][6] = {
16110                                                 { 'a', 'l', 'n', 'u', 'm' },
16111                                                 { 'a', 'l', 'p', 'h', 'a' },
16112                                                 { 'a', 's', 'c', 'i', 'i' },
16113                                                 { 'b', 'l', 'a', 'n', 'k' },
16114                                                 { 'c', 'n', 't', 'r', 'l' },
16115                                                 { 'd', 'i', 'g', 'i', 't' },
16116                                                 { 'g', 'r', 'a', 'p', 'h' },
16117                                                 { 'l', 'o', 'w', 'e', 'r' },
16118                                                 { 'p', 'r', 'i', 'n', 't' },
16119                                                 { 'p', 'u', 'n', 'c', 't' },
16120                                                 { 's', 'p', 'a', 'c', 'e' },
16121                                                 { 'u', 'p', 'p', 'e', 'r' },
16122                                                 { 'w', 'o', 'r', 'd' },
16123                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
16124                                             };
16125             /* The names of the above all have added NULs to make them the same
16126              * size, so we need to also have the real lengths */
16127             const UV posix_name_lengths[] = {
16128                                                 sizeof("alnum") - 1,
16129                                                 sizeof("alpha") - 1,
16130                                                 sizeof("ascii") - 1,
16131                                                 sizeof("blank") - 1,
16132                                                 sizeof("cntrl") - 1,
16133                                                 sizeof("digit") - 1,
16134                                                 sizeof("graph") - 1,
16135                                                 sizeof("lower") - 1,
16136                                                 sizeof("print") - 1,
16137                                                 sizeof("punct") - 1,
16138                                                 sizeof("space") - 1,
16139                                                 sizeof("upper") - 1,
16140                                                 sizeof("word")  - 1,
16141                                                 sizeof("xdigit")- 1
16142                                             };
16143             unsigned int i;
16144             int temp_max = max_distance;    /* Use a temporary, so if we
16145                                                reparse, we haven't changed the
16146                                                outer one */
16147
16148             /* Use a smaller max edit distance if we are missing one of the
16149              * delimiters */
16150             if (   has_opening_bracket + has_opening_colon < 2
16151                 || has_terminating_bracket + has_terminating_colon < 2)
16152             {
16153                 temp_max--;
16154             }
16155
16156             /* See if the input name is close to a legal one */
16157             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16158
16159                 /* Short circuit call if the lengths are too far apart to be
16160                  * able to match */
16161                 if (abs( (int) (name_len - posix_name_lengths[i]))
16162                     > temp_max)
16163                 {
16164                     continue;
16165                 }
16166
16167                 if (edit_distance(input_text,
16168                                   posix_names[i],
16169                                   name_len,
16170                                   posix_name_lengths[i],
16171                                   temp_max
16172                                  )
16173                     > -1)
16174                 { /* If it is close, it probably was intended to be a class */
16175                     goto probably_meant_to_be;
16176                 }
16177             }
16178
16179             /* Here the input name is not close enough to a valid class name
16180              * for us to consider it to be intended to be a posix class.  If
16181              * we haven't already done so, and the parse found a character that
16182              * could have been terminators for the name, but which we absorbed
16183              * as typos during the first pass, repeat the parse, signalling it
16184              * to stop at that character */
16185             if (possible_end && possible_end != (char *) -1) {
16186                 possible_end = (char *) -1;
16187                 p = name_start;
16188                 goto parse_name;
16189             }
16190
16191             /* Here neither pass found a close-enough class name */
16192             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16193         }
16194
16195     probably_meant_to_be:
16196
16197         /* Here we think that a posix specification was intended.  Update any
16198          * parse pointer */
16199         if (updated_parse_ptr) {
16200             *updated_parse_ptr = (char *) p;
16201         }
16202
16203         /* If a posix class name was intended but incorrectly specified, we
16204          * output or return the warnings */
16205         if (found_problem) {
16206
16207             /* We set flags for these issues in the parse loop above instead of
16208              * adding them to the list of warnings, because we can parse it
16209              * twice, and we only want one warning instance */
16210             if (has_upper) {
16211                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16212             }
16213             if (has_blank) {
16214                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16215             }
16216             if (has_semi_colon) {
16217                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16218             }
16219             else if (! has_terminating_colon) {
16220                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16221             }
16222             if (! has_terminating_bracket) {
16223                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16224             }
16225
16226             if (   posix_warnings
16227                 && RExC_warn_text
16228                 && av_count(RExC_warn_text) > 0)
16229             {
16230                 *posix_warnings = RExC_warn_text;
16231             }
16232         }
16233         else if (class_number != OOB_NAMEDCLASS) {
16234             /* If it is a known class, return the class.  The class number
16235              * #defines are structured so each complement is +1 to the normal
16236              * one */
16237             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16238         }
16239         else if (! check_only) {
16240
16241             /* Here, it is an unrecognized class.  This is an error (unless the
16242             * call is to check only, which we've already handled above) */
16243             const char * const complement_string = (complement)
16244                                                    ? "^"
16245                                                    : "";
16246             RExC_parse = (char *) p;
16247             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16248                         complement_string,
16249                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16250         }
16251     }
16252
16253     return OOB_NAMEDCLASS;
16254 }
16255 #undef ADD_POSIX_WARNING
16256
16257 STATIC unsigned  int
16258 S_regex_set_precedence(const U8 my_operator) {
16259
16260     /* Returns the precedence in the (?[...]) construct of the input operator,
16261      * specified by its character representation.  The precedence follows
16262      * general Perl rules, but it extends this so that ')' and ']' have (low)
16263      * precedence even though they aren't really operators */
16264
16265     switch (my_operator) {
16266         case '!':
16267             return 5;
16268         case '&':
16269             return 4;
16270         case '^':
16271         case '|':
16272         case '+':
16273         case '-':
16274             return 3;
16275         case ')':
16276             return 2;
16277         case ']':
16278             return 1;
16279     }
16280
16281     NOT_REACHED; /* NOTREACHED */
16282     return 0;   /* Silence compiler warning */
16283 }
16284
16285 STATIC regnode_offset
16286 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16287                     I32 *flagp, U32 depth,
16288                     char * const oregcomp_parse)
16289 {
16290     /* Handle the (?[...]) construct to do set operations */
16291
16292     U8 curchar;                     /* Current character being parsed */
16293     UV start, end;                  /* End points of code point ranges */
16294     SV* final = NULL;               /* The end result inversion list */
16295     SV* result_string;              /* 'final' stringified */
16296     AV* stack;                      /* stack of operators and operands not yet
16297                                        resolved */
16298     AV* fence_stack = NULL;         /* A stack containing the positions in
16299                                        'stack' of where the undealt-with left
16300                                        parens would be if they were actually
16301                                        put there */
16302     /* The 'volatile' is a workaround for an optimiser bug
16303      * in Solaris Studio 12.3. See RT #127455 */
16304     volatile IV fence = 0;          /* Position of where most recent undealt-
16305                                        with left paren in stack is; -1 if none.
16306                                      */
16307     STRLEN len;                     /* Temporary */
16308     regnode_offset node;            /* Temporary, and final regnode returned by
16309                                        this function */
16310     const bool save_fold = FOLD;    /* Temporary */
16311     char *save_end, *save_parse;    /* Temporaries */
16312     const bool in_locale = LOC;     /* we turn off /l during processing */
16313
16314     DECLARE_AND_GET_RE_DEBUG_FLAGS;
16315
16316     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16317     PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16318
16319     DEBUG_PARSE("xcls");
16320
16321     if (in_locale) {
16322         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16323     }
16324
16325     /* The use of this operator implies /u.  This is required so that the
16326      * compile time values are valid in all runtime cases */
16327     REQUIRE_UNI_RULES(flagp, 0);
16328
16329     ckWARNexperimental(RExC_parse,
16330                        WARN_EXPERIMENTAL__REGEX_SETS,
16331                        "The regex_sets feature is experimental");
16332
16333     /* Everything in this construct is a metacharacter.  Operands begin with
16334      * either a '\' (for an escape sequence), or a '[' for a bracketed
16335      * character class.  Any other character should be an operator, or
16336      * parenthesis for grouping.  Both types of operands are handled by calling
16337      * regclass() to parse them.  It is called with a parameter to indicate to
16338      * return the computed inversion list.  The parsing here is implemented via
16339      * a stack.  Each entry on the stack is a single character representing one
16340      * of the operators; or else a pointer to an operand inversion list. */
16341
16342 #define IS_OPERATOR(a) SvIOK(a)
16343 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16344
16345     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16346      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16347      * with pronouncing it called it Reverse Polish instead, but now that YOU
16348      * know how to pronounce it you can use the correct term, thus giving due
16349      * credit to the person who invented it, and impressing your geek friends.
16350      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16351      * it is now more like an English initial W (as in wonk) than an L.)
16352      *
16353      * This means that, for example, 'a | b & c' is stored on the stack as
16354      *
16355      * c  [4]
16356      * b  [3]
16357      * &  [2]
16358      * a  [1]
16359      * |  [0]
16360      *
16361      * where the numbers in brackets give the stack [array] element number.
16362      * In this implementation, parentheses are not stored on the stack.
16363      * Instead a '(' creates a "fence" so that the part of the stack below the
16364      * fence is invisible except to the corresponding ')' (this allows us to
16365      * replace testing for parens, by using instead subtraction of the fence
16366      * position).  As new operands are processed they are pushed onto the stack
16367      * (except as noted in the next paragraph).  New operators of higher
16368      * precedence than the current final one are inserted on the stack before
16369      * the lhs operand (so that when the rhs is pushed next, everything will be
16370      * in the correct positions shown above.  When an operator of equal or
16371      * lower precedence is encountered in parsing, all the stacked operations
16372      * of equal or higher precedence are evaluated, leaving the result as the
16373      * top entry on the stack.  This makes higher precedence operations
16374      * evaluate before lower precedence ones, and causes operations of equal
16375      * precedence to left associate.
16376      *
16377      * The only unary operator '!' is immediately pushed onto the stack when
16378      * encountered.  When an operand is encountered, if the top of the stack is
16379      * a '!", the complement is immediately performed, and the '!' popped.  The
16380      * resulting value is treated as a new operand, and the logic in the
16381      * previous paragraph is executed.  Thus in the expression
16382      *      [a] + ! [b]
16383      * the stack looks like
16384      *
16385      * !
16386      * a
16387      * +
16388      *
16389      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16390      * becomes
16391      *
16392      * !b
16393      * a
16394      * +
16395      *
16396      * A ')' is treated as an operator with lower precedence than all the
16397      * aforementioned ones, which causes all operations on the stack above the
16398      * corresponding '(' to be evaluated down to a single resultant operand.
16399      * Then the fence for the '(' is removed, and the operand goes through the
16400      * algorithm above, without the fence.
16401      *
16402      * A separate stack is kept of the fence positions, so that the position of
16403      * the latest so-far unbalanced '(' is at the top of it.
16404      *
16405      * The ']' ending the construct is treated as the lowest operator of all,
16406      * so that everything gets evaluated down to a single operand, which is the
16407      * result */
16408
16409     sv_2mortal((SV *)(stack = newAV()));
16410     sv_2mortal((SV *)(fence_stack = newAV()));
16411
16412     while (RExC_parse < RExC_end) {
16413         I32 top_index;              /* Index of top-most element in 'stack' */
16414         SV** top_ptr;               /* Pointer to top 'stack' element */
16415         SV* current = NULL;         /* To contain the current inversion list
16416                                        operand */
16417         SV* only_to_avoid_leaks;
16418
16419         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16420                                 TRUE /* Force /x */ );
16421         if (RExC_parse >= RExC_end) {   /* Fail */
16422             break;
16423         }
16424
16425         curchar = UCHARAT(RExC_parse);
16426
16427 redo_curchar:
16428
16429 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16430                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16431         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16432                                            stack, fence, fence_stack));
16433 #endif
16434
16435         top_index = av_tindex_skip_len_mg(stack);
16436
16437         switch (curchar) {
16438             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16439             char stacked_operator;  /* The topmost operator on the 'stack'. */
16440             SV* lhs;                /* Operand to the left of the operator */
16441             SV* rhs;                /* Operand to the right of the operator */
16442             SV* fence_ptr;          /* Pointer to top element of the fence
16443                                        stack */
16444             case '(':
16445
16446                 if (   RExC_parse < RExC_end - 2
16447                     && UCHARAT(RExC_parse + 1) == '?'
16448                     && UCHARAT(RExC_parse + 2) == '^')
16449                 {
16450                     const regnode_offset orig_emit = RExC_emit;
16451                     SV * resultant_invlist;
16452
16453                     /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16454                      * This happens when we have some thing like
16455                      *
16456                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16457                      *   ...
16458                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16459                      *
16460                      * Here we would be handling the interpolated
16461                      * '$thai_or_lao'.  We handle this by a recursive call to
16462                      * reg which returns the inversion list the
16463                      * interpolated expression evaluates to.  Actually, the
16464                      * return is a special regnode containing a pointer to that
16465                      * inversion list.  If the return isn't that regnode alone,
16466                      * we know that this wasn't such an interpolation, which is
16467                      * an error: we need to get a single inversion list back
16468                      * from the recursion */
16469
16470                     RExC_parse++;
16471                     RExC_sets_depth++;
16472
16473                     node = reg(pRExC_state, 2, flagp, depth+1);
16474                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16475
16476                     if (   OP(REGNODE_p(node)) != REGEX_SET
16477                            /* If more than a single node returned, the nested
16478                             * parens evaluated to more than just a (?[...]),
16479                             * which isn't legal */
16480                         || RExC_emit != orig_emit
16481                                       + NODE_STEP_REGNODE
16482                                       + regarglen[REGEX_SET])
16483                     {
16484                         vFAIL("Expecting interpolated extended charclass");
16485                     }
16486                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16487                     current = invlist_clone(resultant_invlist, NULL);
16488                     SvREFCNT_dec(resultant_invlist);
16489
16490                     RExC_sets_depth--;
16491                     RExC_emit = orig_emit;
16492                     goto handle_operand;
16493                 }
16494
16495                 /* A regular '('.  Look behind for illegal syntax */
16496                 if (top_index - fence >= 0) {
16497                     /* If the top entry on the stack is an operator, it had
16498                      * better be a '!', otherwise the entry below the top
16499                      * operand should be an operator */
16500                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16501                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16502                         || (   IS_OPERAND(*top_ptr)
16503                             && (   top_index - fence < 1
16504                                 || ! (stacked_ptr = av_fetch(stack,
16505                                                              top_index - 1,
16506                                                              FALSE))
16507                                 || ! IS_OPERATOR(*stacked_ptr))))
16508                     {
16509                         RExC_parse++;
16510                         vFAIL("Unexpected '(' with no preceding operator");
16511                     }
16512                 }
16513
16514                 /* Stack the position of this undealt-with left paren */
16515                 av_push(fence_stack, newSViv(fence));
16516                 fence = top_index + 1;
16517                 break;
16518
16519             case '\\':
16520                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16521                  * multi-char folds are allowed.  */
16522                 if (!regclass(pRExC_state, flagp, depth+1,
16523                               TRUE, /* means parse just the next thing */
16524                               FALSE, /* don't allow multi-char folds */
16525                               FALSE, /* don't silence non-portable warnings.  */
16526                               TRUE,  /* strict */
16527                               FALSE, /* Require return to be an ANYOF */
16528                               &current))
16529                 {
16530                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16531                     goto regclass_failed;
16532                 }
16533
16534                 assert(current);
16535
16536                 /* regclass() will return with parsing just the \ sequence,
16537                  * leaving the parse pointer at the next thing to parse */
16538                 RExC_parse--;
16539                 goto handle_operand;
16540
16541             case '[':   /* Is a bracketed character class */
16542             {
16543                 /* See if this is a [:posix:] class. */
16544                 bool is_posix_class = (OOB_NAMEDCLASS
16545                             < handle_possible_posix(pRExC_state,
16546                                                 RExC_parse + 1,
16547                                                 NULL,
16548                                                 NULL,
16549                                                 TRUE /* checking only */));
16550                 /* If it is a posix class, leave the parse pointer at the '['
16551                  * to fool regclass() into thinking it is part of a
16552                  * '[[:posix:]]'. */
16553                 if (! is_posix_class) {
16554                     RExC_parse++;
16555                 }
16556
16557                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16558                  * multi-char folds are allowed.  */
16559                 if (!regclass(pRExC_state, flagp, depth+1,
16560                                 is_posix_class, /* parse the whole char
16561                                                     class only if not a
16562                                                     posix class */
16563                                 FALSE, /* don't allow multi-char folds */
16564                                 TRUE, /* silence non-portable warnings. */
16565                                 TRUE, /* strict */
16566                                 FALSE, /* Require return to be an ANYOF */
16567                                 &current))
16568                 {
16569                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16570                     goto regclass_failed;
16571                 }
16572
16573                 assert(current);
16574
16575                 /* function call leaves parse pointing to the ']', except if we
16576                  * faked it */
16577                 if (is_posix_class) {
16578                     RExC_parse--;
16579                 }
16580
16581                 goto handle_operand;
16582             }
16583
16584             case ']':
16585                 if (top_index >= 1) {
16586                     goto join_operators;
16587                 }
16588
16589                 /* Only a single operand on the stack: are done */
16590                 goto done;
16591
16592             case ')':
16593                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16594                     if (UCHARAT(RExC_parse - 1) == ']')  {
16595                         break;
16596                     }
16597                     RExC_parse++;
16598                     vFAIL("Unexpected ')'");
16599                 }
16600
16601                 /* If nothing after the fence, is missing an operand */
16602                 if (top_index - fence < 0) {
16603                     RExC_parse++;
16604                     goto bad_syntax;
16605                 }
16606                 /* If at least two things on the stack, treat this as an
16607                   * operator */
16608                 if (top_index - fence >= 1) {
16609                     goto join_operators;
16610                 }
16611
16612                 /* Here only a single thing on the fenced stack, and there is a
16613                  * fence.  Get rid of it */
16614                 fence_ptr = av_pop(fence_stack);
16615                 assert(fence_ptr);
16616                 fence = SvIV(fence_ptr);
16617                 SvREFCNT_dec_NN(fence_ptr);
16618                 fence_ptr = NULL;
16619
16620                 if (fence < 0) {
16621                     fence = 0;
16622                 }
16623
16624                 /* Having gotten rid of the fence, we pop the operand at the
16625                  * stack top and process it as a newly encountered operand */
16626                 current = av_pop(stack);
16627                 if (IS_OPERAND(current)) {
16628                     goto handle_operand;
16629                 }
16630
16631                 RExC_parse++;
16632                 goto bad_syntax;
16633
16634             case '&':
16635             case '|':
16636             case '+':
16637             case '-':
16638             case '^':
16639
16640                 /* These binary operators should have a left operand already
16641                  * parsed */
16642                 if (   top_index - fence < 0
16643                     || top_index - fence == 1
16644                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16645                     || ! IS_OPERAND(*top_ptr))
16646                 {
16647                     goto unexpected_binary;
16648                 }
16649
16650                 /* If only the one operand is on the part of the stack visible
16651                  * to us, we just place this operator in the proper position */
16652                 if (top_index - fence < 2) {
16653
16654                     /* Place the operator before the operand */
16655
16656                     SV* lhs = av_pop(stack);
16657                     av_push(stack, newSVuv(curchar));
16658                     av_push(stack, lhs);
16659                     break;
16660                 }
16661
16662                 /* But if there is something else on the stack, we need to
16663                  * process it before this new operator if and only if the
16664                  * stacked operation has equal or higher precedence than the
16665                  * new one */
16666
16667              join_operators:
16668
16669                 /* The operator on the stack is supposed to be below both its
16670                  * operands */
16671                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16672                     || IS_OPERAND(*stacked_ptr))
16673                 {
16674                     /* But if not, it's legal and indicates we are completely
16675                      * done if and only if we're currently processing a ']',
16676                      * which should be the final thing in the expression */
16677                     if (curchar == ']') {
16678                         goto done;
16679                     }
16680
16681                   unexpected_binary:
16682                     RExC_parse++;
16683                     vFAIL2("Unexpected binary operator '%c' with no "
16684                            "preceding operand", curchar);
16685                 }
16686                 stacked_operator = (char) SvUV(*stacked_ptr);
16687
16688                 if (regex_set_precedence(curchar)
16689                     > regex_set_precedence(stacked_operator))
16690                 {
16691                     /* Here, the new operator has higher precedence than the
16692                      * stacked one.  This means we need to add the new one to
16693                      * the stack to await its rhs operand (and maybe more
16694                      * stuff).  We put it before the lhs operand, leaving
16695                      * untouched the stacked operator and everything below it
16696                      * */
16697                     lhs = av_pop(stack);
16698                     assert(IS_OPERAND(lhs));
16699
16700                     av_push(stack, newSVuv(curchar));
16701                     av_push(stack, lhs);
16702                     break;
16703                 }
16704
16705                 /* Here, the new operator has equal or lower precedence than
16706                  * what's already there.  This means the operation already
16707                  * there should be performed now, before the new one. */
16708
16709                 rhs = av_pop(stack);
16710                 if (! IS_OPERAND(rhs)) {
16711
16712                     /* This can happen when a ! is not followed by an operand,
16713                      * like in /(?[\t &!])/ */
16714                     goto bad_syntax;
16715                 }
16716
16717                 lhs = av_pop(stack);
16718
16719                 if (! IS_OPERAND(lhs)) {
16720
16721                     /* This can happen when there is an empty (), like in
16722                      * /(?[[0]+()+])/ */
16723                     goto bad_syntax;
16724                 }
16725
16726                 switch (stacked_operator) {
16727                     case '&':
16728                         _invlist_intersection(lhs, rhs, &rhs);
16729                         break;
16730
16731                     case '|':
16732                     case '+':
16733                         _invlist_union(lhs, rhs, &rhs);
16734                         break;
16735
16736                     case '-':
16737                         _invlist_subtract(lhs, rhs, &rhs);
16738                         break;
16739
16740                     case '^':   /* The union minus the intersection */
16741                     {
16742                         SV* i = NULL;
16743                         SV* u = NULL;
16744
16745                         _invlist_union(lhs, rhs, &u);
16746                         _invlist_intersection(lhs, rhs, &i);
16747                         _invlist_subtract(u, i, &rhs);
16748                         SvREFCNT_dec_NN(i);
16749                         SvREFCNT_dec_NN(u);
16750                         break;
16751                     }
16752                 }
16753                 SvREFCNT_dec(lhs);
16754
16755                 /* Here, the higher precedence operation has been done, and the
16756                  * result is in 'rhs'.  We overwrite the stacked operator with
16757                  * the result.  Then we redo this code to either push the new
16758                  * operator onto the stack or perform any higher precedence
16759                  * stacked operation */
16760                 only_to_avoid_leaks = av_pop(stack);
16761                 SvREFCNT_dec(only_to_avoid_leaks);
16762                 av_push(stack, rhs);
16763                 goto redo_curchar;
16764
16765             case '!':   /* Highest priority, right associative */
16766
16767                 /* If what's already at the top of the stack is another '!",
16768                  * they just cancel each other out */
16769                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16770                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16771                 {
16772                     only_to_avoid_leaks = av_pop(stack);
16773                     SvREFCNT_dec(only_to_avoid_leaks);
16774                 }
16775                 else { /* Otherwise, since it's right associative, just push
16776                           onto the stack */
16777                     av_push(stack, newSVuv(curchar));
16778                 }
16779                 break;
16780
16781             default:
16782                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16783                 if (RExC_parse >= RExC_end) {
16784                     break;
16785                 }
16786                 vFAIL("Unexpected character");
16787
16788           handle_operand:
16789
16790             /* Here 'current' is the operand.  If something is already on the
16791              * stack, we have to check if it is a !.  But first, the code above
16792              * may have altered the stack in the time since we earlier set
16793              * 'top_index'.  */
16794
16795             top_index = av_tindex_skip_len_mg(stack);
16796             if (top_index - fence >= 0) {
16797                 /* If the top entry on the stack is an operator, it had better
16798                  * be a '!', otherwise the entry below the top operand should
16799                  * be an operator */
16800                 top_ptr = av_fetch(stack, top_index, FALSE);
16801                 assert(top_ptr);
16802                 if (IS_OPERATOR(*top_ptr)) {
16803
16804                     /* The only permissible operator at the top of the stack is
16805                      * '!', which is applied immediately to this operand. */
16806                     curchar = (char) SvUV(*top_ptr);
16807                     if (curchar != '!') {
16808                         SvREFCNT_dec(current);
16809                         vFAIL2("Unexpected binary operator '%c' with no "
16810                                 "preceding operand", curchar);
16811                     }
16812
16813                     _invlist_invert(current);
16814
16815                     only_to_avoid_leaks = av_pop(stack);
16816                     SvREFCNT_dec(only_to_avoid_leaks);
16817
16818                     /* And we redo with the inverted operand.  This allows
16819                      * handling multiple ! in a row */
16820                     goto handle_operand;
16821                 }
16822                           /* Single operand is ok only for the non-binary ')'
16823                            * operator */
16824                 else if ((top_index - fence == 0 && curchar != ')')
16825                          || (top_index - fence > 0
16826                              && (! (stacked_ptr = av_fetch(stack,
16827                                                            top_index - 1,
16828                                                            FALSE))
16829                                  || IS_OPERAND(*stacked_ptr))))
16830                 {
16831                     SvREFCNT_dec(current);
16832                     vFAIL("Operand with no preceding operator");
16833                 }
16834             }
16835
16836             /* Here there was nothing on the stack or the top element was
16837              * another operand.  Just add this new one */
16838             av_push(stack, current);
16839
16840         } /* End of switch on next parse token */
16841
16842         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16843     } /* End of loop parsing through the construct */
16844
16845     vFAIL("Syntax error in (?[...])");
16846
16847   done:
16848
16849     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16850         if (RExC_parse < RExC_end) {
16851             RExC_parse++;
16852         }
16853
16854         vFAIL("Unexpected ']' with no following ')' in (?[...");
16855     }
16856
16857     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16858         vFAIL("Unmatched (");
16859     }
16860
16861     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16862         || ((final = av_pop(stack)) == NULL)
16863         || ! IS_OPERAND(final)
16864         || ! is_invlist(final)
16865         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16866     {
16867       bad_syntax:
16868         SvREFCNT_dec(final);
16869         vFAIL("Incomplete expression within '(?[ ])'");
16870     }
16871
16872     /* Here, 'final' is the resultant inversion list from evaluating the
16873      * expression.  Return it if so requested */
16874     if (return_invlist) {
16875         *return_invlist = final;
16876         return END;
16877     }
16878
16879     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
16880                                regnode */
16881         RExC_parse++;
16882         node = regpnode(pRExC_state, REGEX_SET, final);
16883     }
16884     else {
16885
16886         /* Otherwise generate a resultant node, based on 'final'.  regclass()
16887          * is expecting a string of ranges and individual code points */
16888         invlist_iterinit(final);
16889         result_string = newSVpvs("");
16890         while (invlist_iternext(final, &start, &end)) {
16891             if (start == end) {
16892                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16893             }
16894             else {
16895                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16896                                                         UVXf "}", start, end);
16897             }
16898         }
16899
16900         /* About to generate an ANYOF (or similar) node from the inversion list
16901          * we have calculated */
16902         save_parse = RExC_parse;
16903         RExC_parse = SvPV(result_string, len);
16904         save_end = RExC_end;
16905         RExC_end = RExC_parse + len;
16906         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16907
16908         /* We turn off folding around the call, as the class we have
16909          * constructed already has all folding taken into consideration, and we
16910          * don't want regclass() to add to that */
16911         RExC_flags &= ~RXf_PMf_FOLD;
16912         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16913          * folds are allowed.  */
16914         node = regclass(pRExC_state, flagp, depth+1,
16915                         FALSE, /* means parse the whole char class */
16916                         FALSE, /* don't allow multi-char folds */
16917                         TRUE, /* silence non-portable warnings.  The above may
16918                                  very well have generated non-portable code
16919                                  points, but they're valid on this machine */
16920                         FALSE, /* similarly, no need for strict */
16921
16922                         /* We can optimize into something besides an ANYOF,
16923                          * except under /l, which needs to be ANYOF because of
16924                          * runtime checks for locale sanity, etc */
16925                     ! in_locale,
16926                         NULL
16927                     );
16928
16929         RESTORE_WARNINGS;
16930         RExC_parse = save_parse + 1;
16931         RExC_end = save_end;
16932         SvREFCNT_dec_NN(final);
16933         SvREFCNT_dec_NN(result_string);
16934
16935         if (save_fold) {
16936             RExC_flags |= RXf_PMf_FOLD;
16937         }
16938
16939         if (!node) {
16940             RETURN_FAIL_ON_RESTART(*flagp, flagp);
16941             goto regclass_failed;
16942         }
16943
16944         /* Fix up the node type if we are in locale.  (We have pretended we are
16945          * under /u for the purposes of regclass(), as this construct will only
16946          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
16947          * (so as to cause any warnings about bad locales to be output in
16948          * regexec.c), and add the flag that indicates to check if not in a
16949          * UTF-8 locale.  The reason we above forbid optimization into
16950          * something other than an ANYOF node is simply to minimize the number
16951          * of code changes in regexec.c.  Otherwise we would have to create new
16952          * EXACTish node types and deal with them.  This decision could be
16953          * revisited should this construct become popular.
16954          *
16955          * (One might think we could look at the resulting ANYOF node and
16956          * suppress the flag if everything is above 255, as those would be
16957          * UTF-8 only, but this isn't true, as the components that led to that
16958          * result could have been locale-affected, and just happen to cancel
16959          * each other out under UTF-8 locales.) */
16960         if (in_locale) {
16961             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16962
16963             assert(OP(REGNODE_p(node)) == ANYOF);
16964
16965             OP(REGNODE_p(node)) = ANYOFL;
16966             ANYOF_FLAGS(REGNODE_p(node))
16967                     |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16968         }
16969     }
16970
16971     nextchar(pRExC_state);
16972     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16973     return node;
16974
16975   regclass_failed:
16976     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16977                                                                 (UV) *flagp);
16978 }
16979
16980 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16981
16982 STATIC void
16983 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16984                              AV * stack, const IV fence, AV * fence_stack)
16985 {   /* Dumps the stacks in handle_regex_sets() */
16986
16987     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16988     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16989     SSize_t i;
16990
16991     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16992
16993     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16994
16995     if (stack_top < 0) {
16996         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16997     }
16998     else {
16999         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17000         for (i = stack_top; i >= 0; i--) {
17001             SV ** element_ptr = av_fetch(stack, i, FALSE);
17002             if (! element_ptr) {
17003             }
17004
17005             if (IS_OPERATOR(*element_ptr)) {
17006                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17007                                             (int) i, (int) SvIV(*element_ptr));
17008             }
17009             else {
17010                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17011                 sv_dump(*element_ptr);
17012             }
17013         }
17014     }
17015
17016     if (fence_stack_top < 0) {
17017         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17018     }
17019     else {
17020         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17021         for (i = fence_stack_top; i >= 0; i--) {
17022             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17023             if (! element_ptr) {
17024             }
17025
17026             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17027                                             (int) i, (int) SvIV(*element_ptr));
17028         }
17029     }
17030 }
17031
17032 #endif
17033
17034 #undef IS_OPERATOR
17035 #undef IS_OPERAND
17036
17037 STATIC void
17038 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17039 {
17040     /* This adds the Latin1/above-Latin1 folding rules.
17041      *
17042      * This should be called only for a Latin1-range code points, cp, which is
17043      * known to be involved in a simple fold with other code points above
17044      * Latin1.  It would give false results if /aa has been specified.
17045      * Multi-char folds are outside the scope of this, and must be handled
17046      * specially. */
17047
17048     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17049
17050     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17051
17052     /* The rules that are valid for all Unicode versions are hard-coded in */
17053     switch (cp) {
17054         case 'k':
17055         case 'K':
17056           *invlist =
17057              add_cp_to_invlist(*invlist, KELVIN_SIGN);
17058             break;
17059         case 's':
17060         case 'S':
17061           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17062             break;
17063         case MICRO_SIGN:
17064           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17065           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17066             break;
17067         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17068         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17069           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17070             break;
17071         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17072           *invlist = add_cp_to_invlist(*invlist,
17073                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17074             break;
17075
17076         default:    /* Other code points are checked against the data for the
17077                        current Unicode version */
17078           {
17079             Size_t folds_count;
17080             U32 first_fold;
17081             const U32 * remaining_folds;
17082             UV folded_cp;
17083
17084             if (isASCII(cp)) {
17085                 folded_cp = toFOLD(cp);
17086             }
17087             else {
17088                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17089                 Size_t dummy_len;
17090                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17091             }
17092
17093             if (folded_cp > 255) {
17094                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17095             }
17096
17097             folds_count = _inverse_folds(folded_cp, &first_fold,
17098                                                     &remaining_folds);
17099             if (folds_count == 0) {
17100
17101                 /* Use deprecated warning to increase the chances of this being
17102                  * output */
17103                 ckWARN2reg_d(RExC_parse,
17104                         "Perl folding rules are not up-to-date for 0x%02X;"
17105                         " please use the perlbug utility to report;", cp);
17106             }
17107             else {
17108                 unsigned int i;
17109
17110                 if (first_fold > 255) {
17111                     *invlist = add_cp_to_invlist(*invlist, first_fold);
17112                 }
17113                 for (i = 0; i < folds_count - 1; i++) {
17114                     if (remaining_folds[i] > 255) {
17115                         *invlist = add_cp_to_invlist(*invlist,
17116                                                     remaining_folds[i]);
17117                     }
17118                 }
17119             }
17120             break;
17121          }
17122     }
17123 }
17124
17125 STATIC void
17126 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17127 {
17128     /* Output the elements of the array given by '*posix_warnings' as REGEXP
17129      * warnings. */
17130
17131     SV * msg;
17132     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17133
17134     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17135
17136     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17137         CLEAR_POSIX_WARNINGS();
17138         return;
17139     }
17140
17141     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17142         if (first_is_fatal) {           /* Avoid leaking this */
17143             av_undef(posix_warnings);   /* This isn't necessary if the
17144                                             array is mortal, but is a
17145                                             fail-safe */
17146             (void) sv_2mortal(msg);
17147             PREPARE_TO_DIE;
17148         }
17149         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17150         SvREFCNT_dec_NN(msg);
17151     }
17152
17153     UPDATE_WARNINGS_LOC(RExC_parse);
17154 }
17155
17156 PERL_STATIC_INLINE Size_t
17157 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17158 {
17159     const U8 * const start = s1;
17160     const U8 * const send = start + max;
17161
17162     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17163
17164     while (s1 < send && *s1  == *s2) {
17165         s1++; s2++;
17166     }
17167
17168     return s1 - start;
17169 }
17170
17171
17172 STATIC AV *
17173 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17174 {
17175     /* This adds the string scalar <multi_string> to the array
17176      * <multi_char_matches>.  <multi_string> is known to have exactly
17177      * <cp_count> code points in it.  This is used when constructing a
17178      * bracketed character class and we find something that needs to match more
17179      * than a single character.
17180      *
17181      * <multi_char_matches> is actually an array of arrays.  Each top-level
17182      * element is an array that contains all the strings known so far that are
17183      * the same length.  And that length (in number of code points) is the same
17184      * as the index of the top-level array.  Hence, the [2] element is an
17185      * array, each element thereof is a string containing TWO code points;
17186      * while element [3] is for strings of THREE characters, and so on.  Since
17187      * this is for multi-char strings there can never be a [0] nor [1] element.
17188      *
17189      * When we rewrite the character class below, we will do so such that the
17190      * longest strings are written first, so that it prefers the longest
17191      * matching strings first.  This is done even if it turns out that any
17192      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
17193      * Christiansen has agreed that this is ok.  This makes the test for the
17194      * ligature 'ffi' come before the test for 'ff', for example */
17195
17196     AV* this_array;
17197     AV** this_array_ptr;
17198
17199     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17200
17201     if (! multi_char_matches) {
17202         multi_char_matches = newAV();
17203     }
17204
17205     if (av_exists(multi_char_matches, cp_count)) {
17206         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17207         this_array = *this_array_ptr;
17208     }
17209     else {
17210         this_array = newAV();
17211         av_store(multi_char_matches, cp_count,
17212                  (SV*) this_array);
17213     }
17214     av_push(this_array, multi_string);
17215
17216     return multi_char_matches;
17217 }
17218
17219 /* The names of properties whose definitions are not known at compile time are
17220  * stored in this SV, after a constant heading.  So if the length has been
17221  * changed since initialization, then there is a run-time definition. */
17222 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
17223                                         (SvCUR(listsv) != initial_listsv_len)
17224
17225 /* There is a restricted set of white space characters that are legal when
17226  * ignoring white space in a bracketed character class.  This generates the
17227  * code to skip them.
17228  *
17229  * There is a line below that uses the same white space criteria but is outside
17230  * this macro.  Both here and there must use the same definition */
17231 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
17232     STMT_START {                                                        \
17233         if (do_skip) {                                                  \
17234             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
17235             {                                                           \
17236                 p++;                                                    \
17237             }                                                           \
17238         }                                                               \
17239     } STMT_END
17240
17241 STATIC regnode_offset
17242 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17243                  const bool stop_at_1,  /* Just parse the next thing, don't
17244                                            look for a full character class */
17245                  bool allow_mutiple_chars,
17246                  const bool silence_non_portable,   /* Don't output warnings
17247                                                        about too large
17248                                                        characters */
17249                  const bool strict,
17250                  bool optimizable,                  /* ? Allow a non-ANYOF return
17251                                                        node */
17252                  SV** ret_invlist  /* Return an inversion list, not a node */
17253           )
17254 {
17255     /* parse a bracketed class specification.  Most of these will produce an
17256      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17257      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17258      * under /i with multi-character folds: it will be rewritten following the
17259      * paradigm of this example, where the <multi-fold>s are characters which
17260      * fold to multiple character sequences:
17261      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17262      * gets effectively rewritten as:
17263      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17264      * reg() gets called (recursively) on the rewritten version, and this
17265      * function will return what it constructs.  (Actually the <multi-fold>s
17266      * aren't physically removed from the [abcdefghi], it's just that they are
17267      * ignored in the recursion by means of a flag:
17268      * <RExC_in_multi_char_class>.)
17269      *
17270      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17271      * characters, with the corresponding bit set if that character is in the
17272      * list.  For characters above this, an inversion list is used.  There
17273      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17274      * determinable at compile time
17275      *
17276      * On success, returns the offset at which any next node should be placed
17277      * into the regex engine program being compiled.
17278      *
17279      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17280      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17281      * UTF-8
17282      */
17283
17284     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17285     IV range = 0;
17286     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17287     regnode_offset ret = -1;    /* Initialized to an illegal value */
17288     STRLEN numlen;
17289     int namedclass = OOB_NAMEDCLASS;
17290     char *rangebegin = NULL;
17291     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17292                                aren't available at the time this was called */
17293     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17294                                       than just initialized.  */
17295     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17296     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17297                                extended beyond the Latin1 range.  These have to
17298                                be kept separate from other code points for much
17299                                of this function because their handling  is
17300                                different under /i, and for most classes under
17301                                /d as well */
17302     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17303                                separate for a while from the non-complemented
17304                                versions because of complications with /d
17305                                matching */
17306     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17307                                   treated more simply than the general case,
17308                                   leading to less compilation and execution
17309                                   work */
17310     UV element_count = 0;   /* Number of distinct elements in the class.
17311                                Optimizations may be possible if this is tiny */
17312     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17313                                        character; used under /i */
17314     UV n;
17315     char * stop_ptr = RExC_end;    /* where to stop parsing */
17316
17317     /* ignore unescaped whitespace? */
17318     const bool skip_white = cBOOL(   ret_invlist
17319                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17320
17321     /* inversion list of code points this node matches only when the target
17322      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17323      * /d) */
17324     SV* upper_latin1_only_utf8_matches = NULL;
17325
17326     /* Inversion list of code points this node matches regardless of things
17327      * like locale, folding, utf8ness of the target string */
17328     SV* cp_list = NULL;
17329
17330     /* Like cp_list, but code points on this list need to be checked for things
17331      * that fold to/from them under /i */
17332     SV* cp_foldable_list = NULL;
17333
17334     /* Like cp_list, but code points on this list are valid only when the
17335      * runtime locale is UTF-8 */
17336     SV* only_utf8_locale_list = NULL;
17337
17338     /* In a range, if one of the endpoints is non-character-set portable,
17339      * meaning that it hard-codes a code point that may mean a different
17340      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17341      * mnemonic '\t' which each mean the same character no matter which
17342      * character set the platform is on. */
17343     unsigned int non_portable_endpoint = 0;
17344
17345     /* Is the range unicode? which means on a platform that isn't 1-1 native
17346      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17347      * to be a Unicode value.  */
17348     bool unicode_range = FALSE;
17349     bool invert = FALSE;    /* Is this class to be complemented */
17350
17351     bool warn_super = ALWAYS_WARN_SUPER;
17352
17353     const char * orig_parse = RExC_parse;
17354
17355     /* This variable is used to mark where the end in the input is of something
17356      * that looks like a POSIX construct but isn't.  During the parse, when
17357      * something looks like it could be such a construct is encountered, it is
17358      * checked for being one, but not if we've already checked this area of the
17359      * input.  Only after this position is reached do we check again */
17360     char *not_posix_region_end = RExC_parse - 1;
17361
17362     AV* posix_warnings = NULL;
17363     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17364     U8 op = END;    /* The returned node-type, initialized to an impossible
17365                        one.  */
17366     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17367     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17368
17369
17370 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17371  * mutually exclusive.) */
17372 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17373                                             haven't been defined as of yet */
17374 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17375                                             UTF-8 or not */
17376 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17377                                             what gets folded */
17378     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17379
17380     DECLARE_AND_GET_RE_DEBUG_FLAGS;
17381
17382     PERL_ARGS_ASSERT_REGCLASS;
17383 #ifndef DEBUGGING
17384     PERL_UNUSED_ARG(depth);
17385 #endif
17386
17387     assert(! (ret_invlist && allow_mutiple_chars));
17388
17389     /* If wants an inversion list returned, we can't optimize to something
17390      * else. */
17391     if (ret_invlist) {
17392         optimizable = FALSE;
17393     }
17394
17395     DEBUG_PARSE("clas");
17396
17397 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17398     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17399                                    && UNICODE_DOT_DOT_VERSION == 0)
17400     allow_mutiple_chars = FALSE;
17401 #endif
17402
17403     /* We include the /i status at the beginning of this so that we can
17404      * know it at runtime */
17405     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17406     initial_listsv_len = SvCUR(listsv);
17407     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17408
17409     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17410
17411     assert(RExC_parse <= RExC_end);
17412
17413     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
17414         RExC_parse++;
17415         invert = TRUE;
17416         allow_mutiple_chars = FALSE;
17417         MARK_NAUGHTY(1);
17418         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17419     }
17420
17421     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17422     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17423         int maybe_class = handle_possible_posix(pRExC_state,
17424                                                 RExC_parse,
17425                                                 &not_posix_region_end,
17426                                                 NULL,
17427                                                 TRUE /* checking only */);
17428         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17429             ckWARN4reg(not_posix_region_end,
17430                     "POSIX syntax [%c %c] belongs inside character classes%s",
17431                     *RExC_parse, *RExC_parse,
17432                     (maybe_class == OOB_NAMEDCLASS)
17433                     ? ((POSIXCC_NOTYET(*RExC_parse))
17434                         ? " (but this one isn't implemented)"
17435                         : " (but this one isn't fully valid)")
17436                     : ""
17437                     );
17438         }
17439     }
17440
17441     /* If the caller wants us to just parse a single element, accomplish this
17442      * by faking the loop ending condition */
17443     if (stop_at_1 && RExC_end > RExC_parse) {
17444         stop_ptr = RExC_parse + 1;
17445     }
17446
17447     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17448     if (UCHARAT(RExC_parse) == ']')
17449         goto charclassloop;
17450
17451     while (1) {
17452
17453         if (   posix_warnings
17454             && av_tindex_skip_len_mg(posix_warnings) >= 0
17455             && RExC_parse > not_posix_region_end)
17456         {
17457             /* Warnings about posix class issues are considered tentative until
17458              * we are far enough along in the parse that we can no longer
17459              * change our mind, at which point we output them.  This is done
17460              * each time through the loop so that a later class won't zap them
17461              * before they have been dealt with. */
17462             output_posix_warnings(pRExC_state, posix_warnings);
17463         }
17464
17465         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17466
17467         if  (RExC_parse >= stop_ptr) {
17468             break;
17469         }
17470
17471         if  (UCHARAT(RExC_parse) == ']') {
17472             break;
17473         }
17474
17475       charclassloop:
17476
17477         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17478         save_value = value;
17479         save_prevvalue = prevvalue;
17480
17481         if (!range) {
17482             rangebegin = RExC_parse;
17483             element_count++;
17484             non_portable_endpoint = 0;
17485         }
17486         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17487             value = utf8n_to_uvchr((U8*)RExC_parse,
17488                                    RExC_end - RExC_parse,
17489                                    &numlen, UTF8_ALLOW_DEFAULT);
17490             RExC_parse += numlen;
17491         }
17492         else
17493             value = UCHARAT(RExC_parse++);
17494
17495         if (value == '[') {
17496             char * posix_class_end;
17497             namedclass = handle_possible_posix(pRExC_state,
17498                                                RExC_parse,
17499                                                &posix_class_end,
17500                                                do_posix_warnings ? &posix_warnings : NULL,
17501                                                FALSE    /* die if error */);
17502             if (namedclass > OOB_NAMEDCLASS) {
17503
17504                 /* If there was an earlier attempt to parse this particular
17505                  * posix class, and it failed, it was a false alarm, as this
17506                  * successful one proves */
17507                 if (   posix_warnings
17508                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17509                     && not_posix_region_end >= RExC_parse
17510                     && not_posix_region_end <= posix_class_end)
17511                 {
17512                     av_undef(posix_warnings);
17513                 }
17514
17515                 RExC_parse = posix_class_end;
17516             }
17517             else if (namedclass == OOB_NAMEDCLASS) {
17518                 not_posix_region_end = posix_class_end;
17519             }
17520             else {
17521                 namedclass = OOB_NAMEDCLASS;
17522             }
17523         }
17524         else if (   RExC_parse - 1 > not_posix_region_end
17525                  && MAYBE_POSIXCC(value))
17526         {
17527             (void) handle_possible_posix(
17528                         pRExC_state,
17529                         RExC_parse - 1,  /* -1 because parse has already been
17530                                             advanced */
17531                         &not_posix_region_end,
17532                         do_posix_warnings ? &posix_warnings : NULL,
17533                         TRUE /* checking only */);
17534         }
17535         else if (  strict && ! skip_white
17536                  && (   _generic_isCC(value, _CC_VERTSPACE)
17537                      || is_VERTWS_cp_high(value)))
17538         {
17539             vFAIL("Literal vertical space in [] is illegal except under /x");
17540         }
17541         else if (value == '\\') {
17542             /* Is a backslash; get the code point of the char after it */
17543
17544             if (RExC_parse >= RExC_end) {
17545                 vFAIL("Unmatched [");
17546             }
17547
17548             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17549                 value = utf8n_to_uvchr((U8*)RExC_parse,
17550                                    RExC_end - RExC_parse,
17551                                    &numlen, UTF8_ALLOW_DEFAULT);
17552                 RExC_parse += numlen;
17553             }
17554             else
17555                 value = UCHARAT(RExC_parse++);
17556
17557             /* Some compilers cannot handle switching on 64-bit integer
17558              * values, therefore value cannot be an UV.  Yes, this will
17559              * be a problem later if we want switch on Unicode.
17560              * A similar issue a little bit later when switching on
17561              * namedclass. --jhi */
17562
17563             /* If the \ is escaping white space when white space is being
17564              * skipped, it means that that white space is wanted literally, and
17565              * is already in 'value'.  Otherwise, need to translate the escape
17566              * into what it signifies. */
17567             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17568                 const char * message;
17569                 U32 packed_warn;
17570                 U8 grok_c_char;
17571
17572             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17573             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17574             case 's':   namedclass = ANYOF_SPACE;       break;
17575             case 'S':   namedclass = ANYOF_NSPACE;      break;
17576             case 'd':   namedclass = ANYOF_DIGIT;       break;
17577             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17578             case 'v':   namedclass = ANYOF_VERTWS;      break;
17579             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17580             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17581             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17582             case 'N':  /* Handle \N{NAME} in class */
17583                 {
17584                     const char * const backslash_N_beg = RExC_parse - 2;
17585                     int cp_count;
17586
17587                     if (! grok_bslash_N(pRExC_state,
17588                                         NULL,      /* No regnode */
17589                                         &value,    /* Yes single value */
17590                                         &cp_count, /* Multiple code pt count */
17591                                         flagp,
17592                                         strict,
17593                                         depth)
17594                     ) {
17595
17596                         if (*flagp & NEED_UTF8)
17597                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17598
17599                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17600
17601                         if (cp_count < 0) {
17602                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17603                         }
17604                         else if (cp_count == 0) {
17605                             ckWARNreg(RExC_parse,
17606                               "Ignoring zero length \\N{} in character class");
17607                         }
17608                         else { /* cp_count > 1 */
17609                             assert(cp_count > 1);
17610                             if (! RExC_in_multi_char_class) {
17611                                 if ( ! allow_mutiple_chars
17612                                     || invert
17613                                     || range
17614                                     || *RExC_parse == '-')
17615                                 {
17616                                     if (strict) {
17617                                         RExC_parse--;
17618                                         vFAIL("\\N{} here is restricted to one character");
17619                                     }
17620                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17621                                     break; /* <value> contains the first code
17622                                               point. Drop out of the switch to
17623                                               process it */
17624                                 }
17625                                 else {
17626                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17627                                                  RExC_parse - backslash_N_beg);
17628                                     multi_char_matches
17629                                         = add_multi_match(multi_char_matches,
17630                                                           multi_char_N,
17631                                                           cp_count);
17632                                 }
17633                             }
17634                         } /* End of cp_count != 1 */
17635
17636                         /* This element should not be processed further in this
17637                          * class */
17638                         element_count--;
17639                         value = save_value;
17640                         prevvalue = save_prevvalue;
17641                         continue;   /* Back to top of loop to get next char */
17642                     }
17643
17644                     /* Here, is a single code point, and <value> contains it */
17645                     unicode_range = TRUE;   /* \N{} are Unicode */
17646                 }
17647                 break;
17648             case 'p':
17649             case 'P':
17650                 {
17651                 char *e;
17652
17653                 if (RExC_pm_flags & PMf_WILDCARD) {
17654                     RExC_parse++;
17655                     /* diag_listed_as: Use of %s is not allowed in Unicode
17656                        property wildcard subpatterns in regex; marked by <--
17657                        HERE in m/%s/ */
17658                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17659                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17660                 }
17661
17662                 /* \p means they want Unicode semantics */
17663                 REQUIRE_UNI_RULES(flagp, 0);
17664
17665                 if (RExC_parse >= RExC_end)
17666                     vFAIL2("Empty \\%c", (U8)value);
17667                 if (*RExC_parse == '{') {
17668                     const U8 c = (U8)value;
17669                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17670                     if (!e) {
17671                         RExC_parse++;
17672                         vFAIL2("Missing right brace on \\%c{}", c);
17673                     }
17674
17675                     RExC_parse++;
17676
17677                     /* White space is allowed adjacent to the braces and after
17678                      * any '^', even when not under /x */
17679                     while (isSPACE(*RExC_parse)) {
17680                          RExC_parse++;
17681                     }
17682
17683                     if (UCHARAT(RExC_parse) == '^') {
17684
17685                         /* toggle.  (The rhs xor gets the single bit that
17686                          * differs between P and p; the other xor inverts just
17687                          * that bit) */
17688                         value ^= 'P' ^ 'p';
17689
17690                         RExC_parse++;
17691                         while (isSPACE(*RExC_parse)) {
17692                             RExC_parse++;
17693                         }
17694                     }
17695
17696                     if (e == RExC_parse)
17697                         vFAIL2("Empty \\%c{}", c);
17698
17699                     n = e - RExC_parse;
17700                     while (isSPACE(*(RExC_parse + n - 1)))
17701                         n--;
17702
17703                 }   /* The \p isn't immediately followed by a '{' */
17704                 else if (! isALPHA(*RExC_parse)) {
17705                     RExC_parse += (UTF)
17706                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17707                                   : 1;
17708                     vFAIL2("Character following \\%c must be '{' or a "
17709                            "single-character Unicode property name",
17710                            (U8) value);
17711                 }
17712                 else {
17713                     e = RExC_parse;
17714                     n = 1;
17715                 }
17716                 {
17717                     char* name = RExC_parse;
17718
17719                     /* Any message returned about expanding the definition */
17720                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17721
17722                     /* If set TRUE, the property is user-defined as opposed to
17723                      * official Unicode */
17724                     bool user_defined = FALSE;
17725                     AV * strings = NULL;
17726
17727                     SV * prop_definition = parse_uniprop_string(
17728                                             name, n, UTF, FOLD,
17729                                             FALSE, /* This is compile-time */
17730
17731                                             /* We can't defer this defn when
17732                                              * the full result is required in
17733                                              * this call */
17734                                             ! cBOOL(ret_invlist),
17735
17736                                             &strings,
17737                                             &user_defined,
17738                                             msg,
17739                                             0 /* Base level */
17740                                            );
17741                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17742                         assert(prop_definition == NULL);
17743                         RExC_parse = e + 1;
17744                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17745                                                thing so, or else the display is
17746                                                mojibake */
17747                             RExC_utf8 = TRUE;
17748                         }
17749                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17750                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17751                                     SvCUR(msg), SvPVX(msg)));
17752                     }
17753
17754                     assert(prop_definition || strings);
17755
17756                     if (strings) {
17757                         if (ret_invlist) {
17758                             if (! prop_definition) {
17759                                 RExC_parse = e + 1;
17760                                 vFAIL("Unicode string properties are not implemented in (?[...])");
17761                             }
17762                             else {
17763                                 ckWARNreg(e + 1,
17764                                     "Using just the single character results"
17765                                     " returned by \\p{} in (?[...])");
17766                             }
17767                         }
17768                         else if (! RExC_in_multi_char_class) {
17769                             if (invert ^ (value == 'P')) {
17770                                 RExC_parse = e + 1;
17771                                 vFAIL("Inverting a character class which contains"
17772                                     " a multi-character sequence is illegal");
17773                             }
17774
17775                             /* For each multi-character string ... */
17776                             while (av_count(strings) > 0) {
17777                                 /* ... Each entry is itself an array of code
17778                                 * points. */
17779                                 AV * this_string = (AV *) av_shift( strings);
17780                                 STRLEN cp_count = av_count(this_string);
17781                                 SV * final = newSV(cp_count * 4);
17782                                 SvPVCLEAR(final);
17783
17784                                 /* Create another string of sequences of \x{...} */
17785                                 while (av_count(this_string) > 0) {
17786                                     SV * character = av_shift(this_string);
17787                                     UV cp = SvUV(character);
17788
17789                                     if (cp > 255) {
17790                                         REQUIRE_UTF8(flagp);
17791                                     }
17792                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17793                                                                         cp);
17794                                     SvREFCNT_dec_NN(character);
17795                                 }
17796                                 SvREFCNT_dec_NN(this_string);
17797
17798                                 /* And add that to the list of such things */
17799                                 multi_char_matches
17800                                             = add_multi_match(multi_char_matches,
17801                                                             final,
17802                                                             cp_count);
17803                             }
17804                         }
17805                         SvREFCNT_dec_NN(strings);
17806                     }
17807
17808                     if (! prop_definition) {    /* If we got only a string,
17809                                                    this iteration didn't really
17810                                                    find a character */
17811                         element_count--;
17812                     }
17813                     else if (! is_invlist(prop_definition)) {
17814
17815                         /* Here, the definition isn't known, so we have gotten
17816                          * returned a string that will be evaluated if and when
17817                          * encountered at runtime.  We add it to the list of
17818                          * such properties, along with whether it should be
17819                          * complemented or not */
17820                         if (value == 'P') {
17821                             sv_catpvs(listsv, "!");
17822                         }
17823                         else {
17824                             sv_catpvs(listsv, "+");
17825                         }
17826                         sv_catsv(listsv, prop_definition);
17827
17828                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17829
17830                         /* We don't know yet what this matches, so have to flag
17831                          * it */
17832                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17833                     }
17834                     else {
17835                         assert (prop_definition && is_invlist(prop_definition));
17836
17837                         /* Here we do have the complete property definition
17838                          *
17839                          * Temporary workaround for [perl #133136].  For this
17840                          * precise input that is in the .t that is failing,
17841                          * load utf8.pm, which is what the test wants, so that
17842                          * that .t passes */
17843                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17844                                         "foo\\p{Alnum}")
17845                             && ! hv_common(GvHVn(PL_incgv),
17846                                            NULL,
17847                                            "utf8.pm", sizeof("utf8.pm") - 1,
17848                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17849                         {
17850                             require_pv("utf8.pm");
17851                         }
17852
17853                         if (! user_defined &&
17854                             /* We warn on matching an above-Unicode code point
17855                              * if the match would return true, except don't
17856                              * warn for \p{All}, which has exactly one element
17857                              * = 0 */
17858                             (_invlist_contains_cp(prop_definition, 0x110000)
17859                                 && (! (_invlist_len(prop_definition) == 1
17860                                        && *invlist_array(prop_definition) == 0))))
17861                         {
17862                             warn_super = TRUE;
17863                         }
17864
17865                         /* Invert if asking for the complement */
17866                         if (value == 'P') {
17867                             _invlist_union_complement_2nd(properties,
17868                                                           prop_definition,
17869                                                           &properties);
17870                         }
17871                         else {
17872                             _invlist_union(properties, prop_definition, &properties);
17873                         }
17874                     }
17875                 }
17876
17877                 RExC_parse = e + 1;
17878                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17879                                                 named */
17880                 }
17881                 break;
17882             case 'n':   value = '\n';                   break;
17883             case 'r':   value = '\r';                   break;
17884             case 't':   value = '\t';                   break;
17885             case 'f':   value = '\f';                   break;
17886             case 'b':   value = '\b';                   break;
17887             case 'e':   value = ESC_NATIVE;             break;
17888             case 'a':   value = '\a';                   break;
17889             case 'o':
17890                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17891                 if (! grok_bslash_o(&RExC_parse,
17892                                             RExC_end,
17893                                             &value,
17894                                             &message,
17895                                             &packed_warn,
17896                                             strict,
17897                                             cBOOL(range), /* MAX_UV allowed for range
17898                                                       upper limit */
17899                                             UTF))
17900                 {
17901                     vFAIL(message);
17902                 }
17903                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17904                     warn_non_literal_string(RExC_parse, packed_warn, message);
17905                 }
17906
17907                 if (value < 256) {
17908                     non_portable_endpoint++;
17909                 }
17910                 break;
17911             case 'x':
17912                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17913                 if (!  grok_bslash_x(&RExC_parse,
17914                                             RExC_end,
17915                                             &value,
17916                                             &message,
17917                                             &packed_warn,
17918                                             strict,
17919                                             cBOOL(range), /* MAX_UV allowed for range
17920                                                       upper limit */
17921                                             UTF))
17922                 {
17923                     vFAIL(message);
17924                 }
17925                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17926                     warn_non_literal_string(RExC_parse, packed_warn, message);
17927                 }
17928
17929                 if (value < 256) {
17930                     non_portable_endpoint++;
17931                 }
17932                 break;
17933             case 'c':
17934                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17935                                                                 &packed_warn))
17936                 {
17937                     /* going to die anyway; point to exact spot of
17938                         * failure */
17939                     RExC_parse += (UTF)
17940                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17941                                   : 1;
17942                     vFAIL(message);
17943                 }
17944
17945                 value = grok_c_char;
17946                 RExC_parse++;
17947                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17948                     warn_non_literal_string(RExC_parse, packed_warn, message);
17949                 }
17950
17951                 non_portable_endpoint++;
17952                 break;
17953             case '0': case '1': case '2': case '3': case '4':
17954             case '5': case '6': case '7':
17955                 {
17956                     /* Take 1-3 octal digits */
17957                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17958                               | PERL_SCAN_NOTIFY_ILLDIGIT;
17959                     numlen = (strict) ? 4 : 3;
17960                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17961                     RExC_parse += numlen;
17962                     if (numlen != 3) {
17963                         if (strict) {
17964                             RExC_parse += (UTF)
17965                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17966                                           : 1;
17967                             vFAIL("Need exactly 3 octal digits");
17968                         }
17969                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17970                                  && RExC_parse < RExC_end
17971                                  && isDIGIT(*RExC_parse)
17972                                  && ckWARN(WARN_REGEXP))
17973                         {
17974                             reg_warn_non_literal_string(
17975                                  RExC_parse + 1,
17976                                  form_alien_digit_msg(8, numlen, RExC_parse,
17977                                                         RExC_end, UTF, FALSE));
17978                         }
17979                     }
17980                     if (value < 256) {
17981                         non_portable_endpoint++;
17982                     }
17983                     break;
17984                 }
17985             default:
17986                 /* Allow \_ to not give an error */
17987                 if (isWORDCHAR(value) && value != '_') {
17988                     if (strict) {
17989                         vFAIL2("Unrecognized escape \\%c in character class",
17990                                (int)value);
17991                     }
17992                     else {
17993                         ckWARN2reg(RExC_parse,
17994                             "Unrecognized escape \\%c in character class passed through",
17995                             (int)value);
17996                     }
17997                 }
17998                 break;
17999             }   /* End of switch on char following backslash */
18000         } /* end of handling backslash escape sequences */
18001
18002         /* Here, we have the current token in 'value' */
18003
18004         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18005             U8 classnum;
18006
18007             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
18008              * literal, as is the character that began the false range, i.e.
18009              * the 'a' in the examples */
18010             if (range) {
18011                 const int w = (RExC_parse >= rangebegin)
18012                                 ? RExC_parse - rangebegin
18013                                 : 0;
18014                 if (strict) {
18015                     vFAIL2utf8f(
18016                         "False [] range \"%" UTF8f "\"",
18017                         UTF8fARG(UTF, w, rangebegin));
18018                 }
18019                 else {
18020                     ckWARN2reg(RExC_parse,
18021                         "False [] range \"%" UTF8f "\"",
18022                         UTF8fARG(UTF, w, rangebegin));
18023                     cp_list = add_cp_to_invlist(cp_list, '-');
18024                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18025                                                             prevvalue);
18026                 }
18027
18028                 range = 0; /* this was not a true range */
18029                 element_count += 2; /* So counts for three values */
18030             }
18031
18032             classnum = namedclass_to_classnum(namedclass);
18033
18034             if (LOC && namedclass < ANYOF_POSIXL_MAX
18035 #ifndef HAS_ISASCII
18036                 && classnum != _CC_ASCII
18037 #endif
18038             ) {
18039                 SV* scratch_list = NULL;
18040
18041                 /* What the Posix classes (like \w, [:space:]) match isn't
18042                  * generally knowable under locale until actual match time.  A
18043                  * special node is used for these which has extra space for a
18044                  * bitmap, with a bit reserved for each named class that is to
18045                  * be matched against.  (This isn't needed for \p{} and
18046                  * pseudo-classes, as they are not affected by locale, and
18047                  * hence are dealt with separately.)  However, if a named class
18048                  * and its complement are both present, then it matches
18049                  * everything, and there is no runtime dependency.  Odd numbers
18050                  * are the complements of the next lower number, so xor works.
18051                  * (Note that something like [\w\D] should match everything,
18052                  * because \d should be a proper subset of \w.  But rather than
18053                  * trust that the locale is well behaved, we leave this to
18054                  * runtime to sort out) */
18055                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18056                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18057                     POSIXL_ZERO(posixl);
18058                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18059                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18060                     continue;   /* We could ignore the rest of the class, but
18061                                    best to parse it for any errors */
18062                 }
18063                 else { /* Here, isn't the complement of any already parsed
18064                           class */
18065                     POSIXL_SET(posixl, namedclass);
18066                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18067                     anyof_flags |= ANYOF_MATCHES_POSIXL;
18068
18069                     /* The above-Latin1 characters are not subject to locale
18070                      * rules.  Just add them to the unconditionally-matched
18071                      * list */
18072
18073                     /* Get the list of the above-Latin1 code points this
18074                      * matches */
18075                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18076                                             PL_XPosix_ptrs[classnum],
18077
18078                                             /* Odd numbers are complements,
18079                                              * like NDIGIT, NASCII, ... */
18080                                             namedclass % 2 != 0,
18081                                             &scratch_list);
18082                     /* Checking if 'cp_list' is NULL first saves an extra
18083                      * clone.  Its reference count will be decremented at the
18084                      * next union, etc, or if this is the only instance, at the
18085                      * end of the routine */
18086                     if (! cp_list) {
18087                         cp_list = scratch_list;
18088                     }
18089                     else {
18090                         _invlist_union(cp_list, scratch_list, &cp_list);
18091                         SvREFCNT_dec_NN(scratch_list);
18092                     }
18093                     continue;   /* Go get next character */
18094                 }
18095             }
18096             else {
18097
18098                 /* Here, is not /l, or is a POSIX class for which /l doesn't
18099                  * matter (or is a Unicode property, which is skipped here). */
18100                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
18101                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18102
18103                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
18104                          * nor /l make a difference in what these match,
18105                          * therefore we just add what they match to cp_list. */
18106                         if (classnum != _CC_VERTSPACE) {
18107                             assert(   namedclass == ANYOF_HORIZWS
18108                                    || namedclass == ANYOF_NHORIZWS);
18109
18110                             /* It turns out that \h is just a synonym for
18111                              * XPosixBlank */
18112                             classnum = _CC_BLANK;
18113                         }
18114
18115                         _invlist_union_maybe_complement_2nd(
18116                                 cp_list,
18117                                 PL_XPosix_ptrs[classnum],
18118                                 namedclass % 2 != 0,    /* Complement if odd
18119                                                           (NHORIZWS, NVERTWS)
18120                                                         */
18121                                 &cp_list);
18122                     }
18123                 }
18124                 else if (   AT_LEAST_UNI_SEMANTICS
18125                          || classnum == _CC_ASCII
18126                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
18127                                                    || classnum == _CC_XDIGIT)))
18128                 {
18129                     /* We usually have to worry about /d affecting what POSIX
18130                      * classes match, with special code needed because we won't
18131                      * know until runtime what all matches.  But there is no
18132                      * extra work needed under /u and /a; and [:ascii:] is
18133                      * unaffected by /d; and :digit: and :xdigit: don't have
18134                      * runtime differences under /d.  So we can special case
18135                      * these, and avoid some extra work below, and at runtime.
18136                      * */
18137                     _invlist_union_maybe_complement_2nd(
18138                                                      simple_posixes,
18139                                                       ((AT_LEAST_ASCII_RESTRICTED)
18140                                                        ? PL_Posix_ptrs[classnum]
18141                                                        : PL_XPosix_ptrs[classnum]),
18142                                                      namedclass % 2 != 0,
18143                                                      &simple_posixes);
18144                 }
18145                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
18146                            complement and use nposixes */
18147                     SV** posixes_ptr = namedclass % 2 == 0
18148                                        ? &posixes
18149                                        : &nposixes;
18150                     _invlist_union_maybe_complement_2nd(
18151                                                      *posixes_ptr,
18152                                                      PL_XPosix_ptrs[classnum],
18153                                                      namedclass % 2 != 0,
18154                                                      posixes_ptr);
18155                 }
18156             }
18157         } /* end of namedclass \blah */
18158
18159         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18160
18161         /* If 'range' is set, 'value' is the ending of a range--check its
18162          * validity.  (If value isn't a single code point in the case of a
18163          * range, we should have figured that out above in the code that
18164          * catches false ranges).  Later, we will handle each individual code
18165          * point in the range.  If 'range' isn't set, this could be the
18166          * beginning of a range, so check for that by looking ahead to see if
18167          * the next real character to be processed is the range indicator--the
18168          * minus sign */
18169
18170         if (range) {
18171 #ifdef EBCDIC
18172             /* For unicode ranges, we have to test that the Unicode as opposed
18173              * to the native values are not decreasing.  (Above 255, there is
18174              * no difference between native and Unicode) */
18175             if (unicode_range && prevvalue < 255 && value < 255) {
18176                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18177                     goto backwards_range;
18178                 }
18179             }
18180             else
18181 #endif
18182             if (prevvalue > value) /* b-a */ {
18183                 int w;
18184 #ifdef EBCDIC
18185               backwards_range:
18186 #endif
18187                 w = RExC_parse - rangebegin;
18188                 vFAIL2utf8f(
18189                     "Invalid [] range \"%" UTF8f "\"",
18190                     UTF8fARG(UTF, w, rangebegin));
18191                 NOT_REACHED; /* NOTREACHED */
18192             }
18193         }
18194         else {
18195             prevvalue = value; /* save the beginning of the potential range */
18196             if (! stop_at_1     /* Can't be a range if parsing just one thing */
18197                 && *RExC_parse == '-')
18198             {
18199                 char* next_char_ptr = RExC_parse + 1;
18200
18201                 /* Get the next real char after the '-' */
18202                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18203
18204                 /* If the '-' is at the end of the class (just before the ']',
18205                  * it is a literal minus; otherwise it is a range */
18206                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18207                     RExC_parse = next_char_ptr;
18208
18209                     /* a bad range like \w-, [:word:]- ? */
18210                     if (namedclass > OOB_NAMEDCLASS) {
18211                         if (strict || ckWARN(WARN_REGEXP)) {
18212                             const int w = RExC_parse >= rangebegin
18213                                           ?  RExC_parse - rangebegin
18214                                           : 0;
18215                             if (strict) {
18216                                 vFAIL4("False [] range \"%*.*s\"",
18217                                     w, w, rangebegin);
18218                             }
18219                             else {
18220                                 vWARN4(RExC_parse,
18221                                     "False [] range \"%*.*s\"",
18222                                     w, w, rangebegin);
18223                             }
18224                         }
18225                         cp_list = add_cp_to_invlist(cp_list, '-');
18226                         element_count++;
18227                     } else
18228                         range = 1;      /* yeah, it's a range! */
18229                     continue;   /* but do it the next time */
18230                 }
18231             }
18232         }
18233
18234         if (namedclass > OOB_NAMEDCLASS) {
18235             continue;
18236         }
18237
18238         /* Here, we have a single value this time through the loop, and
18239          * <prevvalue> is the beginning of the range, if any; or <value> if
18240          * not. */
18241
18242         /* non-Latin1 code point implies unicode semantics. */
18243         if (value > 255) {
18244             if (value > MAX_LEGAL_CP && (   value != UV_MAX
18245                                          || prevvalue > MAX_LEGAL_CP))
18246             {
18247                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18248             }
18249             REQUIRE_UNI_RULES(flagp, 0);
18250             if (  ! silence_non_portable
18251                 &&  UNICODE_IS_PERL_EXTENDED(value)
18252                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
18253             {
18254                 ckWARN2_non_literal_string(RExC_parse,
18255                                            packWARN(WARN_PORTABLE),
18256                                            PL_extended_cp_format,
18257                                            value);
18258             }
18259         }
18260
18261         /* Ready to process either the single value, or the completed range.
18262          * For single-valued non-inverted ranges, we consider the possibility
18263          * of multi-char folds.  (We made a conscious decision to not do this
18264          * for the other cases because it can often lead to non-intuitive
18265          * results.  For example, you have the peculiar case that:
18266          *  "s s" =~ /^[^\xDF]+$/i => Y
18267          *  "ss"  =~ /^[^\xDF]+$/i => N
18268          *
18269          * See [perl #89750] */
18270         if (FOLD && allow_mutiple_chars && value == prevvalue) {
18271             if (    value == LATIN_SMALL_LETTER_SHARP_S
18272                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18273                                                         value)))
18274             {
18275                 /* Here <value> is indeed a multi-char fold.  Get what it is */
18276
18277                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18278                 STRLEN foldlen;
18279
18280                 UV folded = _to_uni_fold_flags(
18281                                 value,
18282                                 foldbuf,
18283                                 &foldlen,
18284                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18285                                                    ? FOLD_FLAGS_NOMIX_ASCII
18286                                                    : 0)
18287                                 );
18288
18289                 /* Here, <folded> should be the first character of the
18290                  * multi-char fold of <value>, with <foldbuf> containing the
18291                  * whole thing.  But, if this fold is not allowed (because of
18292                  * the flags), <fold> will be the same as <value>, and should
18293                  * be processed like any other character, so skip the special
18294                  * handling */
18295                 if (folded != value) {
18296
18297                     /* Skip if we are recursed, currently parsing the class
18298                      * again.  Otherwise add this character to the list of
18299                      * multi-char folds. */
18300                     if (! RExC_in_multi_char_class) {
18301                         STRLEN cp_count = utf8_length(foldbuf,
18302                                                       foldbuf + foldlen);
18303                         SV* multi_fold = sv_2mortal(newSVpvs(""));
18304
18305                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18306
18307                         multi_char_matches
18308                                         = add_multi_match(multi_char_matches,
18309                                                           multi_fold,
18310                                                           cp_count);
18311
18312                     }
18313
18314                     /* This element should not be processed further in this
18315                      * class */
18316                     element_count--;
18317                     value = save_value;
18318                     prevvalue = save_prevvalue;
18319                     continue;
18320                 }
18321             }
18322         }
18323
18324         if (strict && ckWARN(WARN_REGEXP)) {
18325             if (range) {
18326
18327                 /* If the range starts above 255, everything is portable and
18328                  * likely to be so for any forseeable character set, so don't
18329                  * warn. */
18330                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18331                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18332                 }
18333                 else if (prevvalue != value) {
18334
18335                     /* Under strict, ranges that stop and/or end in an ASCII
18336                      * printable should have each end point be a portable value
18337                      * for it (preferably like 'A', but we don't warn if it is
18338                      * a (portable) Unicode name or code point), and the range
18339                      * must be all digits or all letters of the same case.
18340                      * Otherwise, the range is non-portable and unclear as to
18341                      * what it contains */
18342                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18343                         && (          non_portable_endpoint
18344                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18345                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18346                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18347                     ))) {
18348                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18349                                           " be some subset of \"0-9\","
18350                                           " \"A-Z\", or \"a-z\"");
18351                     }
18352                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18353                         SSize_t index_start;
18354                         SSize_t index_final;
18355
18356                         /* But the nature of Unicode and languages mean we
18357                          * can't do the same checks for above-ASCII ranges,
18358                          * except in the case of digit ones.  These should
18359                          * contain only digits from the same group of 10.  The
18360                          * ASCII case is handled just above.  Hence here, the
18361                          * range could be a range of digits.  First some
18362                          * unlikely special cases.  Grandfather in that a range
18363                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18364                          * if its starting value is one of the 10 digits prior
18365                          * to it.  This is because it is an alternate way of
18366                          * writing 19D1, and some people may expect it to be in
18367                          * that group.  But it is bad, because it won't give
18368                          * the expected results.  In Unicode 5.2 it was
18369                          * considered to be in that group (of 11, hence), but
18370                          * this was fixed in the next version */
18371
18372                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18373                             goto warn_bad_digit_range;
18374                         }
18375                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18376                                           &&     value <= 0x1D7FF))
18377                         {
18378                             /* This is the only other case currently in Unicode
18379                              * where the algorithm below fails.  The code
18380                              * points just above are the end points of a single
18381                              * range containing only decimal digits.  It is 5
18382                              * different series of 0-9.  All other ranges of
18383                              * digits currently in Unicode are just a single
18384                              * series.  (And mktables will notify us if a later
18385                              * Unicode version breaks this.)
18386                              *
18387                              * If the range being checked is at most 9 long,
18388                              * and the digit values represented are in
18389                              * numerical order, they are from the same series.
18390                              * */
18391                             if (         value - prevvalue > 9
18392                                 ||    (((    value - 0x1D7CE) % 10)
18393                                      <= (prevvalue - 0x1D7CE) % 10))
18394                             {
18395                                 goto warn_bad_digit_range;
18396                             }
18397                         }
18398                         else {
18399
18400                             /* For all other ranges of digits in Unicode, the
18401                              * algorithm is just to check if both end points
18402                              * are in the same series, which is the same range.
18403                              * */
18404                             index_start = _invlist_search(
18405                                                     PL_XPosix_ptrs[_CC_DIGIT],
18406                                                     prevvalue);
18407
18408                             /* Warn if the range starts and ends with a digit,
18409                              * and they are not in the same group of 10. */
18410                             if (   index_start >= 0
18411                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18412                                 && (index_final =
18413                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18414                                                     value)) != index_start
18415                                 && index_final >= 0
18416                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18417                             {
18418                               warn_bad_digit_range:
18419                                 vWARN(RExC_parse, "Ranges of digits should be"
18420                                                   " from the same group of"
18421                                                   " 10");
18422                             }
18423                         }
18424                     }
18425                 }
18426             }
18427             if ((! range || prevvalue == value) && non_portable_endpoint) {
18428                 if (isPRINT_A(value)) {
18429                     char literal[3];
18430                     unsigned d = 0;
18431                     if (isBACKSLASHED_PUNCT(value)) {
18432                         literal[d++] = '\\';
18433                     }
18434                     literal[d++] = (char) value;
18435                     literal[d++] = '\0';
18436
18437                     vWARN4(RExC_parse,
18438                            "\"%.*s\" is more clearly written simply as \"%s\"",
18439                            (int) (RExC_parse - rangebegin),
18440                            rangebegin,
18441                            literal
18442                         );
18443                 }
18444                 else if (isMNEMONIC_CNTRL(value)) {
18445                     vWARN4(RExC_parse,
18446                            "\"%.*s\" is more clearly written simply as \"%s\"",
18447                            (int) (RExC_parse - rangebegin),
18448                            rangebegin,
18449                            cntrl_to_mnemonic((U8) value)
18450                         );
18451                 }
18452             }
18453         }
18454
18455         /* Deal with this element of the class */
18456
18457 #ifndef EBCDIC
18458         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18459                                                     prevvalue, value);
18460 #else
18461         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18462          * that don't require special handling, we can just add the range like
18463          * we do for ASCII platforms */
18464         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18465             || ! (prevvalue < 256
18466                     && (unicode_range
18467                         || (! non_portable_endpoint
18468                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18469                                 || (isUPPER_A(prevvalue)
18470                                     && isUPPER_A(value)))))))
18471         {
18472             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18473                                                         prevvalue, value);
18474         }
18475         else {
18476             /* Here, requires special handling.  This can be because it is a
18477              * range whose code points are considered to be Unicode, and so
18478              * must be individually translated into native, or because its a
18479              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18480              * EBCDIC, but we have defined them to include only the "expected"
18481              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18482              * the same in native and Unicode, so can be added as a range */
18483             U8 start = NATIVE_TO_LATIN1(prevvalue);
18484             unsigned j;
18485             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18486             for (j = start; j <= end; j++) {
18487                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18488             }
18489             if (value > 255) {
18490                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18491                                                             256, value);
18492             }
18493         }
18494 #endif
18495
18496         range = 0; /* this range (if it was one) is done now */
18497     } /* End of loop through all the text within the brackets */
18498
18499     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18500         output_posix_warnings(pRExC_state, posix_warnings);
18501     }
18502
18503     /* If anything in the class expands to more than one character, we have to
18504      * deal with them by building up a substitute parse string, and recursively
18505      * calling reg() on it, instead of proceeding */
18506     if (multi_char_matches) {
18507         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18508         I32 cp_count;
18509         STRLEN len;
18510         char *save_end = RExC_end;
18511         char *save_parse = RExC_parse;
18512         char *save_start = RExC_start;
18513         Size_t constructed_prefix_len = 0; /* This gives the length of the
18514                                               constructed portion of the
18515                                               substitute parse. */
18516         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18517                                        a "|" */
18518         I32 reg_flags;
18519
18520         assert(! invert);
18521         /* Only one level of recursion allowed */
18522         assert(RExC_copy_start_in_constructed == RExC_precomp);
18523
18524 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18525            because too confusing */
18526         if (invert) {
18527             sv_catpvs(substitute_parse, "(?:");
18528         }
18529 #endif
18530
18531         /* Look at the longest strings first */
18532         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18533                         cp_count > 0;
18534                         cp_count--)
18535         {
18536
18537             if (av_exists(multi_char_matches, cp_count)) {
18538                 AV** this_array_ptr;
18539                 SV* this_sequence;
18540
18541                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18542                                                  cp_count, FALSE);
18543                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18544                                                                 &PL_sv_undef)
18545                 {
18546                     if (! first_time) {
18547                         sv_catpvs(substitute_parse, "|");
18548                     }
18549                     first_time = FALSE;
18550
18551                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18552                 }
18553             }
18554         }
18555
18556         /* If the character class contains anything else besides these
18557          * multi-character strings, have to include it in recursive parsing */
18558         if (element_count) {
18559             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18560
18561             sv_catpvs(substitute_parse, "|");
18562             if (has_l_bracket) {    /* Add an [ if the original had one */
18563                 sv_catpvs(substitute_parse, "[");
18564             }
18565             constructed_prefix_len = SvCUR(substitute_parse);
18566             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18567
18568             /* Put in a closing ']' to match any opening one, but not if going
18569              * off the end, as otherwise we are adding something that really
18570              * isn't there */
18571             if (has_l_bracket && RExC_parse < RExC_end) {
18572                 sv_catpvs(substitute_parse, "]");
18573             }
18574         }
18575
18576         sv_catpvs(substitute_parse, ")");
18577 #if 0
18578         if (invert) {
18579             /* This is a way to get the parse to skip forward a whole named
18580              * sequence instead of matching the 2nd character when it fails the
18581              * first */
18582             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18583         }
18584 #endif
18585
18586         /* Set up the data structure so that any errors will be properly
18587          * reported.  See the comments at the definition of
18588          * REPORT_LOCATION_ARGS for details */
18589         RExC_copy_start_in_input = (char *) orig_parse;
18590         RExC_start = RExC_parse = SvPV(substitute_parse, len);
18591         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18592         RExC_end = RExC_parse + len;
18593         RExC_in_multi_char_class = 1;
18594
18595         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18596
18597         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18598
18599         /* And restore so can parse the rest of the pattern */
18600         RExC_parse = save_parse;
18601         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18602         RExC_end = save_end;
18603         RExC_in_multi_char_class = 0;
18604         SvREFCNT_dec_NN(multi_char_matches);
18605         return ret;
18606     }
18607
18608     /* If folding, we calculate all characters that could fold to or from the
18609      * ones already on the list */
18610     if (cp_foldable_list) {
18611         if (FOLD) {
18612             UV start, end;      /* End points of code point ranges */
18613
18614             SV* fold_intersection = NULL;
18615             SV** use_list;
18616
18617             /* Our calculated list will be for Unicode rules.  For locale
18618              * matching, we have to keep a separate list that is consulted at
18619              * runtime only when the locale indicates Unicode rules (and we
18620              * don't include potential matches in the ASCII/Latin1 range, as
18621              * any code point could fold to any other, based on the run-time
18622              * locale).   For non-locale, we just use the general list */
18623             if (LOC) {
18624                 use_list = &only_utf8_locale_list;
18625             }
18626             else {
18627                 use_list = &cp_list;
18628             }
18629
18630             /* Only the characters in this class that participate in folds need
18631              * be checked.  Get the intersection of this class and all the
18632              * possible characters that are foldable.  This can quickly narrow
18633              * down a large class */
18634             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18635                                   &fold_intersection);
18636
18637             /* Now look at the foldable characters in this class individually */
18638             invlist_iterinit(fold_intersection);
18639             while (invlist_iternext(fold_intersection, &start, &end)) {
18640                 UV j;
18641                 UV folded;
18642
18643                 /* Look at every character in the range */
18644                 for (j = start; j <= end; j++) {
18645                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18646                     STRLEN foldlen;
18647                     unsigned int k;
18648                     Size_t folds_count;
18649                     U32 first_fold;
18650                     const U32 * remaining_folds;
18651
18652                     if (j < 256) {
18653
18654                         /* Under /l, we don't know what code points below 256
18655                          * fold to, except we do know the MICRO SIGN folds to
18656                          * an above-255 character if the locale is UTF-8, so we
18657                          * add it to the special list (in *use_list)  Otherwise
18658                          * we know now what things can match, though some folds
18659                          * are valid under /d only if the target is UTF-8.
18660                          * Those go in a separate list */
18661                         if (      IS_IN_SOME_FOLD_L1(j)
18662                             && ! (LOC && j != MICRO_SIGN))
18663                         {
18664
18665                             /* ASCII is always matched; non-ASCII is matched
18666                              * only under Unicode rules (which could happen
18667                              * under /l if the locale is a UTF-8 one */
18668                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18669                                 *use_list = add_cp_to_invlist(*use_list,
18670                                                             PL_fold_latin1[j]);
18671                             }
18672                             else if (j != PL_fold_latin1[j]) {
18673                                 upper_latin1_only_utf8_matches
18674                                         = add_cp_to_invlist(
18675                                                 upper_latin1_only_utf8_matches,
18676                                                 PL_fold_latin1[j]);
18677                             }
18678                         }
18679
18680                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18681                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18682                         {
18683                             add_above_Latin1_folds(pRExC_state,
18684                                                    (U8) j,
18685                                                    use_list);
18686                         }
18687                         continue;
18688                     }
18689
18690                     /* Here is an above Latin1 character.  We don't have the
18691                      * rules hard-coded for it.  First, get its fold.  This is
18692                      * the simple fold, as the multi-character folds have been
18693                      * handled earlier and separated out */
18694                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18695                                                         (ASCII_FOLD_RESTRICTED)
18696                                                         ? FOLD_FLAGS_NOMIX_ASCII
18697                                                         : 0);
18698
18699                     /* Single character fold of above Latin1.  Add everything
18700                      * in its fold closure to the list that this node should
18701                      * match. */
18702                     folds_count = _inverse_folds(folded, &first_fold,
18703                                                     &remaining_folds);
18704                     for (k = 0; k <= folds_count; k++) {
18705                         UV c = (k == 0)     /* First time through use itself */
18706                                 ? folded
18707                                 : (k == 1)  /* 2nd time use, the first fold */
18708                                    ? first_fold
18709
18710                                      /* Then the remaining ones */
18711                                    : remaining_folds[k-2];
18712
18713                         /* /aa doesn't allow folds between ASCII and non- */
18714                         if ((   ASCII_FOLD_RESTRICTED
18715                             && (isASCII(c) != isASCII(j))))
18716                         {
18717                             continue;
18718                         }
18719
18720                         /* Folds under /l which cross the 255/256 boundary are
18721                          * added to a separate list.  (These are valid only
18722                          * when the locale is UTF-8.) */
18723                         if (c < 256 && LOC) {
18724                             *use_list = add_cp_to_invlist(*use_list, c);
18725                             continue;
18726                         }
18727
18728                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18729                         {
18730                             cp_list = add_cp_to_invlist(cp_list, c);
18731                         }
18732                         else {
18733                             /* Similarly folds involving non-ascii Latin1
18734                              * characters under /d are added to their list */
18735                             upper_latin1_only_utf8_matches
18736                                     = add_cp_to_invlist(
18737                                                 upper_latin1_only_utf8_matches,
18738                                                 c);
18739                         }
18740                     }
18741                 }
18742             }
18743             SvREFCNT_dec_NN(fold_intersection);
18744         }
18745
18746         /* Now that we have finished adding all the folds, there is no reason
18747          * to keep the foldable list separate */
18748         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18749         SvREFCNT_dec_NN(cp_foldable_list);
18750     }
18751
18752     /* And combine the result (if any) with any inversion lists from posix
18753      * classes.  The lists are kept separate up to now because we don't want to
18754      * fold the classes */
18755     if (simple_posixes) {   /* These are the classes known to be unaffected by
18756                                /a, /aa, and /d */
18757         if (cp_list) {
18758             _invlist_union(cp_list, simple_posixes, &cp_list);
18759             SvREFCNT_dec_NN(simple_posixes);
18760         }
18761         else {
18762             cp_list = simple_posixes;
18763         }
18764     }
18765     if (posixes || nposixes) {
18766         if (! DEPENDS_SEMANTICS) {
18767
18768             /* For everything but /d, we can just add the current 'posixes' and
18769              * 'nposixes' to the main list */
18770             if (posixes) {
18771                 if (cp_list) {
18772                     _invlist_union(cp_list, posixes, &cp_list);
18773                     SvREFCNT_dec_NN(posixes);
18774                 }
18775                 else {
18776                     cp_list = posixes;
18777                 }
18778             }
18779             if (nposixes) {
18780                 if (cp_list) {
18781                     _invlist_union(cp_list, nposixes, &cp_list);
18782                     SvREFCNT_dec_NN(nposixes);
18783                 }
18784                 else {
18785                     cp_list = nposixes;
18786                 }
18787             }
18788         }
18789         else {
18790             /* Under /d, things like \w match upper Latin1 characters only if
18791              * the target string is in UTF-8.  But things like \W match all the
18792              * upper Latin1 characters if the target string is not in UTF-8.
18793              *
18794              * Handle the case with something like \W separately */
18795             if (nposixes) {
18796                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18797
18798                 /* A complemented posix class matches all upper Latin1
18799                  * characters if not in UTF-8.  And it matches just certain
18800                  * ones when in UTF-8.  That means those certain ones are
18801                  * matched regardless, so can just be added to the
18802                  * unconditional list */
18803                 if (cp_list) {
18804                     _invlist_union(cp_list, nposixes, &cp_list);
18805                     SvREFCNT_dec_NN(nposixes);
18806                     nposixes = NULL;
18807                 }
18808                 else {
18809                     cp_list = nposixes;
18810                 }
18811
18812                 /* Likewise for 'posixes' */
18813                 _invlist_union(posixes, cp_list, &cp_list);
18814                 SvREFCNT_dec(posixes);
18815
18816                 /* Likewise for anything else in the range that matched only
18817                  * under UTF-8 */
18818                 if (upper_latin1_only_utf8_matches) {
18819                     _invlist_union(cp_list,
18820                                    upper_latin1_only_utf8_matches,
18821                                    &cp_list);
18822                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18823                     upper_latin1_only_utf8_matches = NULL;
18824                 }
18825
18826                 /* If we don't match all the upper Latin1 characters regardless
18827                  * of UTF-8ness, we have to set a flag to match the rest when
18828                  * not in UTF-8 */
18829                 _invlist_subtract(only_non_utf8_list, cp_list,
18830                                   &only_non_utf8_list);
18831                 if (_invlist_len(only_non_utf8_list) != 0) {
18832                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18833                 }
18834                 SvREFCNT_dec_NN(only_non_utf8_list);
18835             }
18836             else {
18837                 /* Here there were no complemented posix classes.  That means
18838                  * the upper Latin1 characters in 'posixes' match only when the
18839                  * target string is in UTF-8.  So we have to add them to the
18840                  * list of those types of code points, while adding the
18841                  * remainder to the unconditional list.
18842                  *
18843                  * First calculate what they are */
18844                 SV* nonascii_but_latin1_properties = NULL;
18845                 _invlist_intersection(posixes, PL_UpperLatin1,
18846                                       &nonascii_but_latin1_properties);
18847
18848                 /* And add them to the final list of such characters. */
18849                 _invlist_union(upper_latin1_only_utf8_matches,
18850                                nonascii_but_latin1_properties,
18851                                &upper_latin1_only_utf8_matches);
18852
18853                 /* Remove them from what now becomes the unconditional list */
18854                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18855                                   &posixes);
18856
18857                 /* And add those unconditional ones to the final list */
18858                 if (cp_list) {
18859                     _invlist_union(cp_list, posixes, &cp_list);
18860                     SvREFCNT_dec_NN(posixes);
18861                     posixes = NULL;
18862                 }
18863                 else {
18864                     cp_list = posixes;
18865                 }
18866
18867                 SvREFCNT_dec(nonascii_but_latin1_properties);
18868
18869                 /* Get rid of any characters from the conditional list that we
18870                  * now know are matched unconditionally, which may make that
18871                  * list empty */
18872                 _invlist_subtract(upper_latin1_only_utf8_matches,
18873                                   cp_list,
18874                                   &upper_latin1_only_utf8_matches);
18875                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18876                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18877                     upper_latin1_only_utf8_matches = NULL;
18878                 }
18879             }
18880         }
18881     }
18882
18883     /* And combine the result (if any) with any inversion list from properties.
18884      * The lists are kept separate up to now so that we can distinguish the two
18885      * in regards to matching above-Unicode.  A run-time warning is generated
18886      * if a Unicode property is matched against a non-Unicode code point. But,
18887      * we allow user-defined properties to match anything, without any warning,
18888      * and we also suppress the warning if there is a portion of the character
18889      * class that isn't a Unicode property, and which matches above Unicode, \W
18890      * or [\x{110000}] for example.
18891      * (Note that in this case, unlike the Posix one above, there is no
18892      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18893      * forces Unicode semantics */
18894     if (properties) {
18895         if (cp_list) {
18896
18897             /* If it matters to the final outcome, see if a non-property
18898              * component of the class matches above Unicode.  If so, the
18899              * warning gets suppressed.  This is true even if just a single
18900              * such code point is specified, as, though not strictly correct if
18901              * another such code point is matched against, the fact that they
18902              * are using above-Unicode code points indicates they should know
18903              * the issues involved */
18904             if (warn_super) {
18905                 warn_super = ! (invert
18906                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18907             }
18908
18909             _invlist_union(properties, cp_list, &cp_list);
18910             SvREFCNT_dec_NN(properties);
18911         }
18912         else {
18913             cp_list = properties;
18914         }
18915
18916         if (warn_super) {
18917             anyof_flags
18918              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18919
18920             /* Because an ANYOF node is the only one that warns, this node
18921              * can't be optimized into something else */
18922             optimizable = FALSE;
18923         }
18924     }
18925
18926     /* Here, we have calculated what code points should be in the character
18927      * class.
18928      *
18929      * Now we can see about various optimizations.  Fold calculation (which we
18930      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18931      * would invert to include K, which under /i would match k, which it
18932      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18933      * folded until runtime */
18934
18935     /* If we didn't do folding, it's because some information isn't available
18936      * until runtime; set the run-time fold flag for these  We know to set the
18937      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18938      * at least one 0-255 range code point */
18939     if (LOC && FOLD) {
18940
18941         /* Some things on the list might be unconditionally included because of
18942          * other components.  Remove them, and clean up the list if it goes to
18943          * 0 elements */
18944         if (only_utf8_locale_list && cp_list) {
18945             _invlist_subtract(only_utf8_locale_list, cp_list,
18946                               &only_utf8_locale_list);
18947
18948             if (_invlist_len(only_utf8_locale_list) == 0) {
18949                 SvREFCNT_dec_NN(only_utf8_locale_list);
18950                 only_utf8_locale_list = NULL;
18951             }
18952         }
18953         if (    only_utf8_locale_list
18954             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18955                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18956         {
18957             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18958             anyof_flags
18959                  |= ANYOFL_FOLD
18960                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18961         }
18962         else if (cp_list && invlist_lowest(cp_list) < 256) {
18963             /* If nothing is below 256, has no locale dependency; otherwise it
18964              * does */
18965             anyof_flags |= ANYOFL_FOLD;
18966             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18967         }
18968     }
18969     else if (   DEPENDS_SEMANTICS
18970              && (    upper_latin1_only_utf8_matches
18971                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18972     {
18973         RExC_seen_d_op = TRUE;
18974         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18975     }
18976
18977     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18978      * compile time. */
18979     if (     cp_list
18980         &&   invert
18981         && ! has_runtime_dependency)
18982     {
18983         _invlist_invert(cp_list);
18984
18985         /* Clear the invert flag since have just done it here */
18986         invert = FALSE;
18987     }
18988
18989     /* All possible optimizations below still have these characteristics.
18990      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18991      * routine) */
18992     *flagp |= HASWIDTH|SIMPLE;
18993
18994     if (ret_invlist) {
18995         *ret_invlist = cp_list;
18996
18997         return (cp_list) ? RExC_emit : 0;
18998     }
18999
19000     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19001         RExC_contains_locale = 1;
19002     }
19003
19004     /* Some character classes are equivalent to other nodes.  Such nodes take
19005      * up less room, and some nodes require fewer operations to execute, than
19006      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
19007      * improve efficiency. */
19008
19009     if (optimizable) {
19010         PERL_UINT_FAST8_T i;
19011         UV partial_cp_count = 0;
19012         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19013         UV   end[MAX_FOLD_FROMS+1] = { 0 };
19014         bool single_range = FALSE;
19015
19016         if (cp_list) { /* Count the code points in enough ranges that we would
19017                           see all the ones possible in any fold in this version
19018                           of Unicode */
19019
19020             invlist_iterinit(cp_list);
19021             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19022                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19023                     break;
19024                 }
19025                 partial_cp_count += end[i] - start[i] + 1;
19026             }
19027
19028             if (i == 1) {
19029                 single_range = TRUE;
19030             }
19031             invlist_iterfinish(cp_list);
19032         }
19033
19034         /* If we know at compile time that this matches every possible code
19035          * point, any run-time dependencies don't matter */
19036         if (start[0] == 0 && end[0] == UV_MAX) {
19037             if (invert) {
19038                 ret = reganode(pRExC_state, OPFAIL, 0);
19039             }
19040             else {
19041                 ret = reg_node(pRExC_state, SANY);
19042                 MARK_NAUGHTY(1);
19043             }
19044             goto not_anyof;
19045         }
19046
19047         /* Similarly, for /l posix classes, if both a class and its
19048          * complement match, any run-time dependencies don't matter */
19049         if (posixl) {
19050             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19051                                                         namedclass += 2)
19052             {
19053                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
19054                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19055                 {
19056                     if (invert) {
19057                         ret = reganode(pRExC_state, OPFAIL, 0);
19058                     }
19059                     else {
19060                         ret = reg_node(pRExC_state, SANY);
19061                         MARK_NAUGHTY(1);
19062                     }
19063                     goto not_anyof;
19064                 }
19065             }
19066
19067             /* For well-behaved locales, some classes are subsets of others,
19068              * so complementing the subset and including the non-complemented
19069              * superset should match everything, like [\D[:alnum:]], and
19070              * [[:^alpha:][:alnum:]], but some implementations of locales are
19071              * buggy, and khw thinks its a bad idea to have optimization change
19072              * behavior, even if it avoids an OS bug in a given case */
19073
19074 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19075
19076             /* If is a single posix /l class, can optimize to just that op.
19077              * Such a node will not match anything in the Latin1 range, as that
19078              * is not determinable until runtime, but will match whatever the
19079              * class does outside that range.  (Note that some classes won't
19080              * match anything outside the range, like [:ascii:]) */
19081             if (    isSINGLE_BIT_SET(posixl)
19082                 && (partial_cp_count == 0 || start[0] > 255))
19083             {
19084                 U8 classnum;
19085                 SV * class_above_latin1 = NULL;
19086                 bool already_inverted;
19087                 bool are_equivalent;
19088
19089                 /* Compute which bit is set, which is the same thing as, e.g.,
19090                  * ANYOF_CNTRL.  From
19091                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19092                  * */
19093                 static const int MultiplyDeBruijnBitPosition2[32] =
19094                     {
19095                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19096                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19097                     };
19098
19099                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19100                                                           * 0x077CB531U) >> 27];
19101                 classnum = namedclass_to_classnum(namedclass);
19102
19103                 /* The named classes are such that the inverted number is one
19104                  * larger than the non-inverted one */
19105                 already_inverted = namedclass
19106                                  - classnum_to_namedclass(classnum);
19107
19108                 /* Create an inversion list of the official property, inverted
19109                  * if the constructed node list is inverted, and restricted to
19110                  * only the above latin1 code points, which are the only ones
19111                  * known at compile time */
19112                 _invlist_intersection_maybe_complement_2nd(
19113                                                     PL_AboveLatin1,
19114                                                     PL_XPosix_ptrs[classnum],
19115                                                     already_inverted,
19116                                                     &class_above_latin1);
19117                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19118                                                                         FALSE);
19119                 SvREFCNT_dec_NN(class_above_latin1);
19120
19121                 if (are_equivalent) {
19122
19123                     /* Resolve the run-time inversion flag with this possibly
19124                      * inverted class */
19125                     invert = invert ^ already_inverted;
19126
19127                     ret = reg_node(pRExC_state,
19128                                    POSIXL + invert * (NPOSIXL - POSIXL));
19129                     FLAGS(REGNODE_p(ret)) = classnum;
19130                     goto not_anyof;
19131                 }
19132             }
19133         }
19134
19135         /* khw can't think of any other possible transformation involving
19136          * these. */
19137         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19138             goto is_anyof;
19139         }
19140
19141         if (! has_runtime_dependency) {
19142
19143             /* If the list is empty, nothing matches.  This happens, for
19144              * example, when a Unicode property that doesn't match anything is
19145              * the only element in the character class (perluniprops.pod notes
19146              * such properties). */
19147             if (partial_cp_count == 0) {
19148                 if (invert) {
19149                     ret = reg_node(pRExC_state, SANY);
19150                 }
19151                 else {
19152                     ret = reganode(pRExC_state, OPFAIL, 0);
19153                 }
19154
19155                 goto not_anyof;
19156             }
19157
19158             /* If matches everything but \n */
19159             if (   start[0] == 0 && end[0] == '\n' - 1
19160                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19161             {
19162                 assert (! invert);
19163                 ret = reg_node(pRExC_state, REG_ANY);
19164                 MARK_NAUGHTY(1);
19165                 goto not_anyof;
19166             }
19167         }
19168
19169         /* Next see if can optimize classes that contain just a few code points
19170          * into an EXACTish node.  The reason to do this is to let the
19171          * optimizer join this node with adjacent EXACTish ones, and ANYOF
19172          * nodes require conversion to code point from UTF-8.
19173          *
19174          * An EXACTFish node can be generated even if not under /i, and vice
19175          * versa.  But care must be taken.  An EXACTFish node has to be such
19176          * that it only matches precisely the code points in the class, but we
19177          * want to generate the least restrictive one that does that, to
19178          * increase the odds of being able to join with an adjacent node.  For
19179          * example, if the class contains [kK], we have to make it an EXACTFAA
19180          * node to prevent the KELVIN SIGN from matching.  Whether we are under
19181          * /i or not is irrelevant in this case.  Less obvious is the pattern
19182          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
19183          * supposed to match the single character U+0149 LATIN SMALL LETTER N
19184          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
19185          * that includes \X{02BC}, there is a multi-char fold that does, and so
19186          * the node generated for it must be an EXACTFish one.  On the other
19187          * hand qr/:/i should generate a plain EXACT node since the colon
19188          * participates in no fold whatsoever, and having it EXACT tells the
19189          * optimizer the target string cannot match unless it has a colon in
19190          * it.
19191          */
19192         if (   ! posixl
19193             && ! invert
19194
19195                 /* Only try if there are no more code points in the class than
19196                  * in the max possible fold */
19197             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19198         {
19199             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19200             {
19201                 /* We can always make a single code point class into an
19202                  * EXACTish node. */
19203
19204                 if (LOC) {
19205
19206                     /* Here is /l:  Use EXACTL, except if there is a fold not
19207                      * known until runtime so shows as only a single code point
19208                      * here.  For code points above 255, we know which can
19209                      * cause problems by having a potential fold to the Latin1
19210                      * range. */
19211                     if (  ! FOLD
19212                         || (     start[0] > 255
19213                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19214                     {
19215                         op = EXACTL;
19216                     }
19217                     else {
19218                         op = EXACTFL;
19219                     }
19220                 }
19221                 else if (! FOLD) { /* Not /l and not /i */
19222                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19223                 }
19224                 else if (start[0] < 256) { /* /i, not /l, and the code point is
19225                                               small */
19226
19227                     /* Under /i, it gets a little tricky.  A code point that
19228                      * doesn't participate in a fold should be an EXACT node.
19229                      * We know this one isn't the result of a simple fold, or
19230                      * there'd be more than one code point in the list, but it
19231                      * could be part of a multi- character fold.  In that case
19232                      * we better not create an EXACT node, as we would wrongly
19233                      * be telling the optimizer that this code point must be in
19234                      * the target string, and that is wrong.  This is because
19235                      * if the sequence around this code point forms a
19236                      * multi-char fold, what needs to be in the string could be
19237                      * the code point that folds to the sequence.
19238                      *
19239                      * This handles the case of below-255 code points, as we
19240                      * have an easy look up for those.  The next clause handles
19241                      * the above-256 one */
19242                     op = IS_IN_SOME_FOLD_L1(start[0])
19243                          ? EXACTFU
19244                          : EXACT;
19245                 }
19246                 else {  /* /i, larger code point.  Since we are under /i, and
19247                            have just this code point, we know that it can't
19248                            fold to something else, so PL_InMultiCharFold
19249                            applies to it */
19250                     op = _invlist_contains_cp(PL_InMultiCharFold,
19251                                               start[0])
19252                          ? EXACTFU_REQ8
19253                          : EXACT_REQ8;
19254                 }
19255
19256                 value = start[0];
19257             }
19258             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19259                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
19260             {
19261                 /* Here, the only runtime dependency, if any, is from /d, and
19262                  * the class matches more than one code point, and the lowest
19263                  * code point participates in some fold.  It might be that the
19264                  * other code points are /i equivalent to this one, and hence
19265                  * they would representable by an EXACTFish node.  Above, we
19266                  * eliminated classes that contain too many code points to be
19267                  * EXACTFish, with the test for MAX_FOLD_FROMS
19268                  *
19269                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
19270                  * We do this because we have EXACTFAA at our disposal for the
19271                  * ASCII range */
19272                 if (partial_cp_count == 2 && isASCII(start[0])) {
19273
19274                     /* The only ASCII characters that participate in folds are
19275                      * alphabetics */
19276                     assert(isALPHA(start[0]));
19277                     if (   end[0] == start[0]   /* First range is a single
19278                                                    character, so 2nd exists */
19279                         && isALPHA_FOLD_EQ(start[0], start[1]))
19280                     {
19281
19282                         /* Here, is part of an ASCII fold pair */
19283
19284                         if (   ASCII_FOLD_RESTRICTED
19285                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19286                         {
19287                             /* If the second clause just above was true, it
19288                              * means we can't be under /i, or else the list
19289                              * would have included more than this fold pair.
19290                              * Therefore we have to exclude the possibility of
19291                              * whatever else it is that folds to these, by
19292                              * using EXACTFAA */
19293                             op = EXACTFAA;
19294                         }
19295                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19296
19297                             /* Here, there's no simple fold that start[0] is part
19298                              * of, but there is a multi-character one.  If we
19299                              * are not under /i, we want to exclude that
19300                              * possibility; if under /i, we want to include it
19301                              * */
19302                             op = (FOLD) ? EXACTFU : EXACTFAA;
19303                         }
19304                         else {
19305
19306                             /* Here, the only possible fold start[0] particpates in
19307                              * is with start[1].  /i or not isn't relevant */
19308                             op = EXACTFU;
19309                         }
19310
19311                         value = toFOLD(start[0]);
19312                     }
19313                 }
19314                 else if (  ! upper_latin1_only_utf8_matches
19315                          || (   _invlist_len(upper_latin1_only_utf8_matches)
19316                                                                           == 2
19317                              && PL_fold_latin1[
19318                                invlist_highest(upper_latin1_only_utf8_matches)]
19319                              == start[0]))
19320                 {
19321                     /* Here, the smallest character is non-ascii or there are
19322                      * more than 2 code points matched by this node.  Also, we
19323                      * either don't have /d UTF-8 dependent matches, or if we
19324                      * do, they look like they could be a single character that
19325                      * is the fold of the lowest one in the always-match list.
19326                      * This test quickly excludes most of the false positives
19327                      * when there are /d UTF-8 depdendent matches.  These are
19328                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19329                      * SMALL LETTER A WITH GRAVE iff the target string is
19330                      * UTF-8.  (We don't have to worry above about exceeding
19331                      * the array bounds of PL_fold_latin1[] because any code
19332                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
19333                      *
19334                      * EXACTFAA would apply only to pairs (hence exactly 2 code
19335                      * points) in the ASCII range, so we can't use it here to
19336                      * artificially restrict the fold domain, so we check if
19337                      * the class does or does not match some EXACTFish node.
19338                      * Further, if we aren't under /i, and the folded-to
19339                      * character is part of a multi-character fold, we can't do
19340                      * this optimization, as the sequence around it could be
19341                      * that multi-character fold, and we don't here know the
19342                      * context, so we have to assume it is that multi-char
19343                      * fold, to prevent potential bugs.
19344                      *
19345                      * To do the general case, we first find the fold of the
19346                      * lowest code point (which may be higher than the lowest
19347                      * one), then find everything that folds to it.  (The data
19348                      * structure we have only maps from the folded code points,
19349                      * so we have to do the earlier step.) */
19350
19351                     Size_t foldlen;
19352                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19353                     UV folded = _to_uni_fold_flags(start[0],
19354                                                         foldbuf, &foldlen, 0);
19355                     U32 first_fold;
19356                     const U32 * remaining_folds;
19357                     Size_t folds_to_this_cp_count = _inverse_folds(
19358                                                             folded,
19359                                                             &first_fold,
19360                                                             &remaining_folds);
19361                     Size_t folds_count = folds_to_this_cp_count + 1;
19362                     SV * fold_list = _new_invlist(folds_count);
19363                     unsigned int i;
19364
19365                     /* If there are UTF-8 dependent matches, create a temporary
19366                      * list of what this node matches, including them. */
19367                     SV * all_cp_list = NULL;
19368                     SV ** use_this_list = &cp_list;
19369
19370                     if (upper_latin1_only_utf8_matches) {
19371                         all_cp_list = _new_invlist(0);
19372                         use_this_list = &all_cp_list;
19373                         _invlist_union(cp_list,
19374                                        upper_latin1_only_utf8_matches,
19375                                        use_this_list);
19376                     }
19377
19378                     /* Having gotten everything that participates in the fold
19379                      * containing the lowest code point, we turn that into an
19380                      * inversion list, making sure everything is included. */
19381                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19382                     fold_list = add_cp_to_invlist(fold_list, folded);
19383                     if (folds_to_this_cp_count > 0) {
19384                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19385                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19386                             fold_list = add_cp_to_invlist(fold_list,
19387                                                         remaining_folds[i]);
19388                         }
19389                     }
19390
19391                     /* If the fold list is identical to what's in this ANYOF
19392                      * node, the node can be represented by an EXACTFish one
19393                      * instead */
19394                     if (_invlistEQ(*use_this_list, fold_list,
19395                                    0 /* Don't complement */ )
19396                     ) {
19397
19398                         /* But, we have to be careful, as mentioned above.
19399                          * Just the right sequence of characters could match
19400                          * this if it is part of a multi-character fold.  That
19401                          * IS what we want if we are under /i.  But it ISN'T
19402                          * what we want if not under /i, as it could match when
19403                          * it shouldn't.  So, when we aren't under /i and this
19404                          * character participates in a multi-char fold, we
19405                          * don't optimize into an EXACTFish node.  So, for each
19406                          * case below we have to check if we are folding
19407                          * and if not, if it is not part of a multi-char fold.
19408                          * */
19409                         if (start[0] > 255) {    /* Highish code point */
19410                             if (FOLD || ! _invlist_contains_cp(
19411                                             PL_InMultiCharFold, folded))
19412                             {
19413                                 op = (LOC)
19414                                      ? EXACTFLU8
19415                                      : (ASCII_FOLD_RESTRICTED)
19416                                        ? EXACTFAA
19417                                        : EXACTFU_REQ8;
19418                                 value = folded;
19419                             }
19420                         }   /* Below, the lowest code point < 256 */
19421                         else if (    FOLD
19422                                  &&  folded == 's'
19423                                  &&  DEPENDS_SEMANTICS)
19424                         {   /* An EXACTF node containing a single character
19425                                 's', can be an EXACTFU if it doesn't get
19426                                 joined with an adjacent 's' */
19427                             op = EXACTFU_S_EDGE;
19428                             value = folded;
19429                         }
19430                         else if (    FOLD
19431                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19432                         {
19433                             if (upper_latin1_only_utf8_matches) {
19434                                 op = EXACTF;
19435
19436                                 /* We can't use the fold, as that only matches
19437                                  * under UTF-8 */
19438                                 value = start[0];
19439                             }
19440                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19441                                      && ! UTF)
19442                             {   /* EXACTFUP is a special node for this
19443                                    character */
19444                                 op = (ASCII_FOLD_RESTRICTED)
19445                                      ? EXACTFAA
19446                                      : EXACTFUP;
19447                                 value = MICRO_SIGN;
19448                             }
19449                             else if (     ASCII_FOLD_RESTRICTED
19450                                      && ! isASCII(start[0]))
19451                             {   /* For ASCII under /iaa, we can use EXACTFU
19452                                    below */
19453                                 op = EXACTFAA;
19454                                 value = folded;
19455                             }
19456                             else {
19457                                 op = EXACTFU;
19458                                 value = folded;
19459                             }
19460                         }
19461                     }
19462
19463                     SvREFCNT_dec_NN(fold_list);
19464                     SvREFCNT_dec(all_cp_list);
19465                 }
19466             }
19467
19468             if (op != END) {
19469                 U8 len;
19470
19471                 /* Here, we have calculated what EXACTish node to use.  Have to
19472                  * convert to UTF-8 if not already there */
19473                 if (value > 255) {
19474                     if (! UTF) {
19475                         SvREFCNT_dec(cp_list);;
19476                         REQUIRE_UTF8(flagp);
19477                     }
19478
19479                     /* This is a kludge to the special casing issues with this
19480                      * ligature under /aa.  FB05 should fold to FB06, but the
19481                      * call above to _to_uni_fold_flags() didn't find this, as
19482                      * it didn't use the /aa restriction in order to not miss
19483                      * other folds that would be affected.  This is the only
19484                      * instance likely to ever be a problem in all of Unicode.
19485                      * So special case it. */
19486                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19487                         && ASCII_FOLD_RESTRICTED)
19488                     {
19489                         value = LATIN_SMALL_LIGATURE_ST;
19490                     }
19491                 }
19492
19493                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19494
19495                 ret = regnode_guts(pRExC_state, op, len, "exact");
19496                 FILL_NODE(ret, op);
19497                 RExC_emit += 1 + STR_SZ(len);
19498                 setSTR_LEN(REGNODE_p(ret), len);
19499                 if (len == 1) {
19500                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19501                 }
19502                 else {
19503                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19504                 }
19505                 goto not_anyof;
19506             }
19507         }
19508
19509         if (! has_runtime_dependency) {
19510
19511             /* See if this can be turned into an ANYOFM node.  Think about the
19512              * bit patterns in two different bytes.  In some positions, the
19513              * bits in each will be 1; and in other positions both will be 0;
19514              * and in some positions the bit will be 1 in one byte, and 0 in
19515              * the other.  Let 'n' be the number of positions where the bits
19516              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19517              * a position where the two bytes differ.  Now take the set of all
19518              * bytes that when ANDed with the mask yield the same result.  That
19519              * set has 2**n elements, and is representable by just two 8 bit
19520              * numbers: the result and the mask.  Importantly, matching the set
19521              * can be vectorized by creating a word full of the result bytes,
19522              * and a word full of the mask bytes, yielding a significant speed
19523              * up.  Here, see if this node matches such a set.  As a concrete
19524              * example consider [01], and the byte representing '0' which is
19525              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19526              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19527              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19528              * which is a common usage, is optimizable into ANYOFM, and can
19529              * benefit from the speed up.  We can only do this on UTF-8
19530              * invariant bytes, because they have the same bit patterns under
19531              * UTF-8 as not. */
19532             PERL_UINT_FAST8_T inverted = 0;
19533 #ifdef EBCDIC
19534             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19535 #else
19536             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19537 #endif
19538             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19539              * If that works we will instead later generate an NANYOFM, and
19540              * invert back when through */
19541             if (invlist_highest(cp_list) > max_permissible) {
19542                 _invlist_invert(cp_list);
19543                 inverted = 1;
19544             }
19545
19546             if (invlist_highest(cp_list) <= max_permissible) {
19547                 UV this_start, this_end;
19548                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19549                 U8 bits_differing = 0;
19550                 Size_t full_cp_count = 0;
19551                 bool first_time = TRUE;
19552
19553                 /* Go through the bytes and find the bit positions that differ
19554                  * */
19555                 invlist_iterinit(cp_list);
19556                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19557                     unsigned int i = this_start;
19558
19559                     if (first_time) {
19560                         if (! UVCHR_IS_INVARIANT(i)) {
19561                             goto done_anyofm;
19562                         }
19563
19564                         first_time = FALSE;
19565                         lowest_cp = this_start;
19566
19567                         /* We have set up the code point to compare with.
19568                          * Don't compare it with itself */
19569                         i++;
19570                     }
19571
19572                     /* Find the bit positions that differ from the lowest code
19573                      * point in the node.  Keep track of all such positions by
19574                      * OR'ing */
19575                     for (; i <= this_end; i++) {
19576                         if (! UVCHR_IS_INVARIANT(i)) {
19577                             goto done_anyofm;
19578                         }
19579
19580                         bits_differing  |= i ^ lowest_cp;
19581                     }
19582
19583                     full_cp_count += this_end - this_start + 1;
19584                 }
19585
19586                 /* At the end of the loop, we count how many bits differ from
19587                  * the bits in lowest code point, call the count 'd'.  If the
19588                  * set we found contains 2**d elements, it is the closure of
19589                  * all code points that differ only in those bit positions.  To
19590                  * convince yourself of that, first note that the number in the
19591                  * closure must be a power of 2, which we test for.  The only
19592                  * way we could have that count and it be some differing set,
19593                  * is if we got some code points that don't differ from the
19594                  * lowest code point in any position, but do differ from each
19595                  * other in some other position.  That means one code point has
19596                  * a 1 in that position, and another has a 0.  But that would
19597                  * mean that one of them differs from the lowest code point in
19598                  * that position, which possibility we've already excluded.  */
19599                 if (  (inverted || full_cp_count > 1)
19600                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19601                 {
19602                     U8 ANYOFM_mask;
19603
19604                     op = ANYOFM + inverted;;
19605
19606                     /* We need to make the bits that differ be 0's */
19607                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19608
19609                     /* The argument is the lowest code point */
19610                     ret = reganode(pRExC_state, op, lowest_cp);
19611                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19612                 }
19613
19614               done_anyofm:
19615                 invlist_iterfinish(cp_list);
19616             }
19617
19618             if (inverted) {
19619                 _invlist_invert(cp_list);
19620             }
19621
19622             if (op != END) {
19623                 goto not_anyof;
19624             }
19625
19626             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19627              * all were invariants, it wasn't inverted, and there is a single
19628              * range.  This would be faster than some of the posix nodes we
19629              * create below like /\d/a, but would be twice the size.  Without
19630              * having actually measured the gain, khw doesn't think the
19631              * tradeoff is really worth it */
19632         }
19633
19634         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19635             PERL_UINT_FAST8_T type;
19636             SV * intersection = NULL;
19637             SV* d_invlist = NULL;
19638
19639             /* See if this matches any of the POSIX classes.  The POSIXA and
19640              * POSIXD ones are about the same speed as ANYOF ops, but take less
19641              * room; the ones that have above-Latin1 code point matches are
19642              * somewhat faster than ANYOF.  */
19643
19644             for (type = POSIXA; type >= POSIXD; type--) {
19645                 int posix_class;
19646
19647                 if (type == POSIXL) {   /* But not /l posix classes */
19648                     continue;
19649                 }
19650
19651                 for (posix_class = 0;
19652                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19653                      posix_class++)
19654                 {
19655                     SV** our_code_points = &cp_list;
19656                     SV** official_code_points;
19657                     int try_inverted;
19658
19659                     if (type == POSIXA) {
19660                         official_code_points = &PL_Posix_ptrs[posix_class];
19661                     }
19662                     else {
19663                         official_code_points = &PL_XPosix_ptrs[posix_class];
19664                     }
19665
19666                     /* Skip non-existent classes of this type.  e.g. \v only
19667                      * has an entry in PL_XPosix_ptrs */
19668                     if (! *official_code_points) {
19669                         continue;
19670                     }
19671
19672                     /* Try both the regular class, and its inversion */
19673                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19674                         bool this_inverted = invert ^ try_inverted;
19675
19676                         if (type != POSIXD) {
19677
19678                             /* This class that isn't /d can't match if we have
19679                              * /d dependencies */
19680                             if (has_runtime_dependency
19681                                                     & HAS_D_RUNTIME_DEPENDENCY)
19682                             {
19683                                 continue;
19684                             }
19685                         }
19686                         else /* is /d */ if (! this_inverted) {
19687
19688                             /* /d classes don't match anything non-ASCII below
19689                              * 256 unconditionally (which cp_list contains) */
19690                             _invlist_intersection(cp_list, PL_UpperLatin1,
19691                                                            &intersection);
19692                             if (_invlist_len(intersection) != 0) {
19693                                 continue;
19694                             }
19695
19696                             SvREFCNT_dec(d_invlist);
19697                             d_invlist = invlist_clone(cp_list, NULL);
19698
19699                             /* But under UTF-8 it turns into using /u rules.
19700                              * Add the things it matches under these conditions
19701                              * so that we check below that these are identical
19702                              * to what the tested class should match */
19703                             if (upper_latin1_only_utf8_matches) {
19704                                 _invlist_union(
19705                                             d_invlist,
19706                                             upper_latin1_only_utf8_matches,
19707                                             &d_invlist);
19708                             }
19709                             our_code_points = &d_invlist;
19710                         }
19711                         else {  /* POSIXD, inverted.  If this doesn't have this
19712                                    flag set, it isn't /d. */
19713                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19714                             {
19715                                 continue;
19716                             }
19717                             our_code_points = &cp_list;
19718                         }
19719
19720                         /* Here, have weeded out some things.  We want to see
19721                          * if the list of characters this node contains
19722                          * ('*our_code_points') precisely matches those of the
19723                          * class we are currently checking against
19724                          * ('*official_code_points'). */
19725                         if (_invlistEQ(*our_code_points,
19726                                        *official_code_points,
19727                                        try_inverted))
19728                         {
19729                             /* Here, they precisely match.  Optimize this ANYOF
19730                              * node into its equivalent POSIX one of the
19731                              * correct type, possibly inverted */
19732                             ret = reg_node(pRExC_state, (try_inverted)
19733                                                         ? type + NPOSIXA
19734                                                                 - POSIXA
19735                                                         : type);
19736                             FLAGS(REGNODE_p(ret)) = posix_class;
19737                             SvREFCNT_dec(d_invlist);
19738                             SvREFCNT_dec(intersection);
19739                             goto not_anyof;
19740                         }
19741                     }
19742                 }
19743             }
19744             SvREFCNT_dec(d_invlist);
19745             SvREFCNT_dec(intersection);
19746         }
19747
19748         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19749          * both in size and speed.  Currently, a 20 bit range base (smallest
19750          * code point in the range), and a 12 bit maximum delta are packed into
19751          * a 32 bit word.  This allows for using it on all of the Unicode code
19752          * points except for the highest plane, which is only for private use
19753          * code points.  khw doubts that a bigger delta is likely in real world
19754          * applications */
19755         if (     single_range
19756             && ! has_runtime_dependency
19757             &&   anyof_flags == 0
19758             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19759             &&   end[0] - start[0]
19760                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19761                                    * CHARBITS - ANYOFR_BASE_BITS))))
19762
19763         {
19764             U8 low_utf8[UTF8_MAXBYTES+1];
19765             U8 high_utf8[UTF8_MAXBYTES+1];
19766
19767             ret = reganode(pRExC_state, ANYOFR,
19768                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19769
19770             /* Place the lowest UTF-8 start byte in the flags field, so as to
19771              * allow efficient ruling out at run time of many possible inputs.
19772              * */
19773             (void) uvchr_to_utf8(low_utf8, start[0]);
19774             (void) uvchr_to_utf8(high_utf8, end[0]);
19775
19776             /* If all code points share the same first byte, this can be an
19777              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19778              * quickly rule out many inputs at run-time without having to
19779              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19780              * not doing that transformation would not rule out nearly so many
19781              * things */
19782             if (low_utf8[0] == high_utf8[0]) {
19783                 OP(REGNODE_p(ret)) = ANYOFRb;
19784                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19785             }
19786             else {
19787                 ANYOF_FLAGS(REGNODE_p(ret))
19788                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19789             }
19790
19791             goto not_anyof;
19792         }
19793
19794         /* If didn't find an optimization and there is no need for a bitmap,
19795          * optimize to indicate that */
19796         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19797             && ! LOC
19798             && ! upper_latin1_only_utf8_matches
19799             &&   anyof_flags == 0)
19800         {
19801             U8 low_utf8[UTF8_MAXBYTES+1];
19802             UV highest_cp = invlist_highest(cp_list);
19803
19804             /* Currently the maximum allowed code point by the system is
19805              * IV_MAX.  Higher ones are reserved for future internal use.  This
19806              * particular regnode can be used for higher ones, but we can't
19807              * calculate the code point of those.  IV_MAX suffices though, as
19808              * it will be a large first byte */
19809             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19810                            - low_utf8;
19811
19812             /* We store the lowest possible first byte of the UTF-8
19813              * representation, using the flags field.  This allows for quick
19814              * ruling out of some inputs without having to convert from UTF-8
19815              * to code point.  For EBCDIC, we use I8, as not doing that
19816              * transformation would not rule out nearly so many things */
19817             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19818
19819             op = ANYOFH;
19820
19821             /* If the first UTF-8 start byte for the highest code point in the
19822              * range is suitably small, we may be able to get an upper bound as
19823              * well */
19824             if (highest_cp <= IV_MAX) {
19825                 U8 high_utf8[UTF8_MAXBYTES+1];
19826                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19827                                 - high_utf8;
19828
19829                 /* If the lowest and highest are the same, we can get an exact
19830                  * first byte instead of a just minimum or even a sequence of
19831                  * exact leading bytes.  We signal these with different
19832                  * regnodes */
19833                 if (low_utf8[0] == high_utf8[0]) {
19834                     Size_t len = find_first_differing_byte_pos(low_utf8,
19835                                                                high_utf8,
19836                                                        MIN(low_len, high_len));
19837
19838                     if (len == 1) {
19839
19840                         /* No need to convert to I8 for EBCDIC as this is an
19841                          * exact match */
19842                         anyof_flags = low_utf8[0];
19843                         op = ANYOFHb;
19844                     }
19845                     else {
19846                         op = ANYOFHs;
19847                         ret = regnode_guts(pRExC_state, op,
19848                                            regarglen[op] + STR_SZ(len),
19849                                            "anyofhs");
19850                         FILL_NODE(ret, op);
19851                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19852                                                                         = len;
19853                         Copy(low_utf8,  /* Add the common bytes */
19854                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19855                            len, U8);
19856                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19857                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19858                                                   NULL, only_utf8_locale_list);
19859                         goto not_anyof;
19860                     }
19861                 }
19862                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19863                 {
19864
19865                     /* Here, the high byte is not the same as the low, but is
19866                      * small enough that its reasonable to have a loose upper
19867                      * bound, which is packed in with the strict lower bound.
19868                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19869                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19870                      * is the same thing as UTF-8 */
19871
19872                     U8 bits = 0;
19873                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19874                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19875                                   - anyof_flags;
19876
19877                     if (range_diff <= max_range_diff / 8) {
19878                         bits = 3;
19879                     }
19880                     else if (range_diff <= max_range_diff / 4) {
19881                         bits = 2;
19882                     }
19883                     else if (range_diff <= max_range_diff / 2) {
19884                         bits = 1;
19885                     }
19886                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19887                     op = ANYOFHr;
19888                 }
19889             }
19890
19891             goto done_finding_op;
19892         }
19893     }   /* End of seeing if can optimize it into a different node */
19894
19895   is_anyof: /* It's going to be an ANYOF node. */
19896     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19897          ? ANYOFD
19898          : ((posixl)
19899             ? ANYOFPOSIXL
19900             : ((LOC)
19901                ? ANYOFL
19902                : ANYOF));
19903
19904   done_finding_op:
19905
19906     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19907     FILL_NODE(ret, op);        /* We set the argument later */
19908     RExC_emit += 1 + regarglen[op];
19909     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19910
19911     /* Here, <cp_list> contains all the code points we can determine at
19912      * compile time that match under all conditions.  Go through it, and
19913      * for things that belong in the bitmap, put them there, and delete from
19914      * <cp_list>.  While we are at it, see if everything above 255 is in the
19915      * list, and if so, set a flag to speed up execution */
19916
19917     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19918
19919     if (posixl) {
19920         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19921     }
19922
19923     if (invert) {
19924         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19925     }
19926
19927     /* Here, the bitmap has been populated with all the Latin1 code points that
19928      * always match.  Can now add to the overall list those that match only
19929      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19930      * */
19931     if (upper_latin1_only_utf8_matches) {
19932         if (cp_list) {
19933             _invlist_union(cp_list,
19934                            upper_latin1_only_utf8_matches,
19935                            &cp_list);
19936             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19937         }
19938         else {
19939             cp_list = upper_latin1_only_utf8_matches;
19940         }
19941         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19942     }
19943
19944     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19945                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19946                    ? listsv
19947                    : NULL,
19948                   only_utf8_locale_list);
19949     SvREFCNT_dec(cp_list);;
19950     SvREFCNT_dec(only_utf8_locale_list);
19951     return ret;
19952
19953   not_anyof:
19954
19955     /* Here, the node is getting optimized into something that's not an ANYOF
19956      * one.  Finish up. */
19957
19958     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19959                                            RExC_parse - orig_parse);;
19960     SvREFCNT_dec(cp_list);;
19961     SvREFCNT_dec(only_utf8_locale_list);
19962     return ret;
19963 }
19964
19965 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19966
19967 STATIC void
19968 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19969                 regnode* const node,
19970                 SV* const cp_list,
19971                 SV* const runtime_defns,
19972                 SV* const only_utf8_locale_list)
19973 {
19974     /* Sets the arg field of an ANYOF-type node 'node', using information about
19975      * the node passed-in.  If there is nothing outside the node's bitmap, the
19976      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19977      * the count returned by add_data(), having allocated and stored an array,
19978      * av, as follows:
19979      *
19980      *  av[0] stores the inversion list defining this class as far as known at
19981      *        this time, or PL_sv_undef if nothing definite is now known.
19982      *  av[1] stores the inversion list of code points that match only if the
19983      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
19984      *        av[2], or no entry otherwise.
19985      *  av[2] stores the list of user-defined properties whose subroutine
19986      *        definitions aren't known at this time, or no entry if none. */
19987
19988     UV n;
19989
19990     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19991
19992     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19993         assert(! (ANYOF_FLAGS(node)
19994                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19995         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19996     }
19997     else {
19998         AV * const av = newAV();
19999         SV *rv;
20000
20001         if (cp_list) {
20002             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20003         }
20004
20005         /* (Note that if any of this changes, the size calculations in
20006          * S_optimize_regclass() might need to be updated.) */
20007
20008         if (only_utf8_locale_list) {
20009             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20010                                      SvREFCNT_inc_NN(only_utf8_locale_list));
20011         }
20012
20013         if (runtime_defns) {
20014             av_store(av, DEFERRED_USER_DEFINED_INDEX,
20015                          SvREFCNT_inc_NN(runtime_defns));
20016         }
20017
20018         rv = newRV_noinc(MUTABLE_SV(av));
20019         n = add_data(pRExC_state, STR_WITH_LEN("s"));
20020         RExC_rxi->data->data[n] = (void*)rv;
20021         ARG_SET(node, n);
20022     }
20023 }
20024
20025 SV *
20026
20027 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20028 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20029 #else
20030 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20031 #endif
20032
20033 {
20034     /* For internal core use only.
20035      * Returns the inversion list for the input 'node' in the regex 'prog'.
20036      * If <doinit> is 'true', will attempt to create the inversion list if not
20037      *    already done.
20038      * If <listsvp> is non-null, will return the printable contents of the
20039      *    property definition.  This can be used to get debugging information
20040      *    even before the inversion list exists, by calling this function with
20041      *    'doinit' set to false, in which case the components that will be used
20042      *    to eventually create the inversion list are returned  (in a printable
20043      *    form).
20044      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20045      *    store an inversion list of code points that should match only if the
20046      *    execution-time locale is a UTF-8 one.
20047      * If <output_invlist> is not NULL, it is where this routine is to store an
20048      *    inversion list of the code points that would be instead returned in
20049      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
20050      *    when this parameter is used, is just the non-code point data that
20051      *    will go into creating the inversion list.  This currently should be just
20052      *    user-defined properties whose definitions were not known at compile
20053      *    time.  Using this parameter allows for easier manipulation of the
20054      *    inversion list's data by the caller.  It is illegal to call this
20055      *    function with this parameter set, but not <listsvp>
20056      *
20057      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
20058      * that, in spite of this function's name, the inversion list it returns
20059      * may include the bitmap data as well */
20060
20061     SV *si  = NULL;         /* Input initialization string */
20062     SV* invlist = NULL;
20063
20064     RXi_GET_DECL(prog, progi);
20065     const struct reg_data * const data = prog ? progi->data : NULL;
20066
20067 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20068     PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20069 #else
20070     PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20071 #endif
20072     assert(! output_invlist || listsvp);
20073
20074     if (data && data->count) {
20075         const U32 n = ARG(node);
20076
20077         if (data->what[n] == 's') {
20078             SV * const rv = MUTABLE_SV(data->data[n]);
20079             AV * const av = MUTABLE_AV(SvRV(rv));
20080             SV **const ary = AvARRAY(av);
20081
20082             invlist = ary[INVLIST_INDEX];
20083
20084             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20085                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20086             }
20087
20088             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20089                 si = ary[DEFERRED_USER_DEFINED_INDEX];
20090             }
20091
20092             if (doinit && (si || invlist)) {
20093                 if (si) {
20094                     bool user_defined;
20095                     SV * msg = newSVpvs_flags("", SVs_TEMP);
20096
20097                     SV * prop_definition = handle_user_defined_property(
20098                             "", 0, FALSE,   /* There is no \p{}, \P{} */
20099                             SvPVX_const(si)[1] - '0',   /* /i or not has been
20100                                                            stored here for just
20101                                                            this occasion */
20102                             TRUE,           /* run time */
20103                             FALSE,          /* This call must find the defn */
20104                             si,             /* The property definition  */
20105                             &user_defined,
20106                             msg,
20107                             0               /* base level call */
20108                            );
20109
20110                     if (SvCUR(msg)) {
20111                         assert(prop_definition == NULL);
20112
20113                         Perl_croak(aTHX_ "%" UTF8f,
20114                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20115                     }
20116
20117                     if (invlist) {
20118                         _invlist_union(invlist, prop_definition, &invlist);
20119                         SvREFCNT_dec_NN(prop_definition);
20120                     }
20121                     else {
20122                         invlist = prop_definition;
20123                     }
20124
20125                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20126                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20127
20128                     ary[INVLIST_INDEX] = invlist;
20129                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20130                                  ? ONLY_LOCALE_MATCHES_INDEX
20131                                  : INVLIST_INDEX);
20132                     si = NULL;
20133                 }
20134             }
20135         }
20136     }
20137
20138     /* If requested, return a printable version of what this ANYOF node matches
20139      * */
20140     if (listsvp) {
20141         SV* matches_string = NULL;
20142
20143         /* This function can be called at compile-time, before everything gets
20144          * resolved, in which case we return the currently best available
20145          * information, which is the string that will eventually be used to do
20146          * that resolving, 'si' */
20147         if (si) {
20148             /* Here, we only have 'si' (and possibly some passed-in data in
20149              * 'invlist', which is handled below)  If the caller only wants
20150              * 'si', use that.  */
20151             if (! output_invlist) {
20152                 matches_string = newSVsv(si);
20153             }
20154             else {
20155                 /* But if the caller wants an inversion list of the node, we
20156                  * need to parse 'si' and place as much as possible in the
20157                  * desired output inversion list, making 'matches_string' only
20158                  * contain the currently unresolvable things */
20159                 const char *si_string = SvPVX(si);
20160                 STRLEN remaining = SvCUR(si);
20161                 UV prev_cp = 0;
20162                 U8 count = 0;
20163
20164                 /* Ignore everything before and including the first new-line */
20165                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20166                 assert (si_string != NULL);
20167                 si_string++;
20168                 remaining = SvPVX(si) + SvCUR(si) - si_string;
20169
20170                 while (remaining > 0) {
20171
20172                     /* The data consists of just strings defining user-defined
20173                      * property names, but in prior incarnations, and perhaps
20174                      * somehow from pluggable regex engines, it could still
20175                      * hold hex code point definitions, all of which should be
20176                      * legal (or it wouldn't have gotten this far).  Each
20177                      * component of a range would be separated by a tab, and
20178                      * each range by a new-line.  If these are found, instead
20179                      * add them to the inversion list */
20180                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
20181                                      |PERL_SCAN_SILENT_NON_PORTABLE;
20182                     STRLEN len = remaining;
20183                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20184
20185                     /* If the hex decode routine found something, it should go
20186                      * up to the next \n */
20187                     if (   *(si_string + len) == '\n') {
20188                         if (count) {    /* 2nd code point on line */
20189                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20190                         }
20191                         else {
20192                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20193                         }
20194                         count = 0;
20195                         goto prepare_for_next_iteration;
20196                     }
20197
20198                     /* If the hex decode was instead for the lower range limit,
20199                      * save it, and go parse the upper range limit */
20200                     if (*(si_string + len) == '\t') {
20201                         assert(count == 0);
20202
20203                         prev_cp = cp;
20204                         count = 1;
20205                       prepare_for_next_iteration:
20206                         si_string += len + 1;
20207                         remaining -= len + 1;
20208                         continue;
20209                     }
20210
20211                     /* Here, didn't find a legal hex number.  Just add the text
20212                      * from here up to the next \n, omitting any trailing
20213                      * markers. */
20214
20215                     remaining -= len;
20216                     len = strcspn(si_string,
20217                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20218                     remaining -= len;
20219                     if (matches_string) {
20220                         sv_catpvn(matches_string, si_string, len);
20221                     }
20222                     else {
20223                         matches_string = newSVpvn(si_string, len);
20224                     }
20225                     sv_catpvs(matches_string, " ");
20226
20227                     si_string += len;
20228                     if (   remaining
20229                         && UCHARAT(si_string)
20230                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20231                     {
20232                         si_string++;
20233                         remaining--;
20234                     }
20235                     if (remaining && UCHARAT(si_string) == '\n') {
20236                         si_string++;
20237                         remaining--;
20238                     }
20239                 } /* end of loop through the text */
20240
20241                 assert(matches_string);
20242                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
20243                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20244                 }
20245             } /* end of has an 'si' */
20246         }
20247
20248         /* Add the stuff that's already known */
20249         if (invlist) {
20250
20251             /* Again, if the caller doesn't want the output inversion list, put
20252              * everything in 'matches-string' */
20253             if (! output_invlist) {
20254                 if ( ! matches_string) {
20255                     matches_string = newSVpvs("\n");
20256                 }
20257                 sv_catsv(matches_string, invlist_contents(invlist,
20258                                                   TRUE /* traditional style */
20259                                                   ));
20260             }
20261             else if (! *output_invlist) {
20262                 *output_invlist = invlist_clone(invlist, NULL);
20263             }
20264             else {
20265                 _invlist_union(*output_invlist, invlist, output_invlist);
20266             }
20267         }
20268
20269         *listsvp = matches_string;
20270     }
20271
20272     return invlist;
20273 }
20274
20275 /* reg_skipcomment()
20276
20277    Absorbs an /x style # comment from the input stream,
20278    returning a pointer to the first character beyond the comment, or if the
20279    comment terminates the pattern without anything following it, this returns
20280    one past the final character of the pattern (in other words, RExC_end) and
20281    sets the REG_RUN_ON_COMMENT_SEEN flag.
20282
20283    Note it's the callers responsibility to ensure that we are
20284    actually in /x mode
20285
20286 */
20287
20288 PERL_STATIC_INLINE char*
20289 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20290 {
20291     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20292
20293     assert(*p == '#');
20294
20295     while (p < RExC_end) {
20296         if (*(++p) == '\n') {
20297             return p+1;
20298         }
20299     }
20300
20301     /* we ran off the end of the pattern without ending the comment, so we have
20302      * to add an \n when wrapping */
20303     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20304     return p;
20305 }
20306
20307 STATIC void
20308 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20309                                 char ** p,
20310                                 const bool force_to_xmod
20311                          )
20312 {
20313     /* If the text at the current parse position '*p' is a '(?#...)' comment,
20314      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20315      * is /x whitespace, advance '*p' so that on exit it points to the first
20316      * byte past all such white space and comments */
20317
20318     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20319
20320     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20321
20322     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20323
20324     for (;;) {
20325         if (RExC_end - (*p) >= 3
20326             && *(*p)     == '('
20327             && *(*p + 1) == '?'
20328             && *(*p + 2) == '#')
20329         {
20330             while (*(*p) != ')') {
20331                 if ((*p) == RExC_end)
20332                     FAIL("Sequence (?#... not terminated");
20333                 (*p)++;
20334             }
20335             (*p)++;
20336             continue;
20337         }
20338
20339         if (use_xmod) {
20340             const char * save_p = *p;
20341             while ((*p) < RExC_end) {
20342                 STRLEN len;
20343                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20344                     (*p) += len;
20345                 }
20346                 else if (*(*p) == '#') {
20347                     (*p) = reg_skipcomment(pRExC_state, (*p));
20348                 }
20349                 else {
20350                     break;
20351                 }
20352             }
20353             if (*p != save_p) {
20354                 continue;
20355             }
20356         }
20357
20358         break;
20359     }
20360
20361     return;
20362 }
20363
20364 /* nextchar()
20365
20366    Advances the parse position by one byte, unless that byte is the beginning
20367    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20368    those two cases, the parse position is advanced beyond all such comments and
20369    white space.
20370
20371    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20372 */
20373
20374 STATIC void
20375 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20376 {
20377     PERL_ARGS_ASSERT_NEXTCHAR;
20378
20379     if (RExC_parse < RExC_end) {
20380         assert(   ! UTF
20381                || UTF8_IS_INVARIANT(*RExC_parse)
20382                || UTF8_IS_START(*RExC_parse));
20383
20384         RExC_parse += (UTF)
20385                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20386                       : 1;
20387
20388         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20389                                 FALSE /* Don't force /x */ );
20390     }
20391 }
20392
20393 STATIC void
20394 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20395 {
20396     /* 'size' is the delta number of smallest regnode equivalents to add or
20397      * subtract from the current memory allocated to the regex engine being
20398      * constructed. */
20399
20400     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20401
20402     RExC_size += size;
20403
20404     Renewc(RExC_rxi,
20405            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20406                                                 /* +1 for REG_MAGIC */
20407            char,
20408            regexp_internal);
20409     if ( RExC_rxi == NULL )
20410         FAIL("Regexp out of space");
20411     RXi_SET(RExC_rx, RExC_rxi);
20412
20413     RExC_emit_start = RExC_rxi->program;
20414     if (size > 0) {
20415         Zero(REGNODE_p(RExC_emit), size, regnode);
20416     }
20417
20418 #ifdef RE_TRACK_PATTERN_OFFSETS
20419     Renew(RExC_offsets, 2*RExC_size+1, U32);
20420     if (size > 0) {
20421         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20422     }
20423     RExC_offsets[0] = RExC_size;
20424 #endif
20425 }
20426
20427 STATIC regnode_offset
20428 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20429 {
20430     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20431      * equivalents space.  It aligns and increments RExC_size
20432      *
20433      * It returns the regnode's offset into the regex engine program */
20434
20435     const regnode_offset ret = RExC_emit;
20436
20437     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20438
20439     PERL_ARGS_ASSERT_REGNODE_GUTS;
20440
20441     SIZE_ALIGN(RExC_size);
20442     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20443     NODE_ALIGN_FILL(REGNODE_p(ret));
20444 #ifndef RE_TRACK_PATTERN_OFFSETS
20445     PERL_UNUSED_ARG(name);
20446     PERL_UNUSED_ARG(op);
20447 #else
20448     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20449
20450     if (RExC_offsets) {         /* MJD */
20451         MJD_OFFSET_DEBUG(
20452               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20453               name, __LINE__,
20454               PL_reg_name[op],
20455               (UV)(RExC_emit) > RExC_offsets[0]
20456                 ? "Overwriting end of array!\n" : "OK",
20457               (UV)(RExC_emit),
20458               (UV)(RExC_parse - RExC_start),
20459               (UV)RExC_offsets[0]));
20460         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20461     }
20462 #endif
20463     return(ret);
20464 }
20465
20466 /*
20467 - reg_node - emit a node
20468 */
20469 STATIC regnode_offset /* Location. */
20470 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20471 {
20472     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20473     regnode_offset ptr = ret;
20474
20475     PERL_ARGS_ASSERT_REG_NODE;
20476
20477     assert(regarglen[op] == 0);
20478
20479     FILL_ADVANCE_NODE(ptr, op);
20480     RExC_emit = ptr;
20481     return(ret);
20482 }
20483
20484 /*
20485 - reganode - emit a node with an argument
20486 */
20487 STATIC regnode_offset /* Location. */
20488 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20489 {
20490     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20491     regnode_offset ptr = ret;
20492
20493     PERL_ARGS_ASSERT_REGANODE;
20494
20495     /* ANYOF are special cased to allow non-length 1 args */
20496     assert(regarglen[op] == 1);
20497
20498     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20499     RExC_emit = ptr;
20500     return(ret);
20501 }
20502
20503 /*
20504 - regpnode - emit a temporary node with a SV* argument
20505 */
20506 STATIC regnode_offset /* Location. */
20507 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20508 {
20509     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20510     regnode_offset ptr = ret;
20511
20512     PERL_ARGS_ASSERT_REGPNODE;
20513
20514     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20515     RExC_emit = ptr;
20516     return(ret);
20517 }
20518
20519 STATIC regnode_offset
20520 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20521 {
20522     /* emit a node with U32 and I32 arguments */
20523
20524     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20525     regnode_offset ptr = ret;
20526
20527     PERL_ARGS_ASSERT_REG2LANODE;
20528
20529     assert(regarglen[op] == 2);
20530
20531     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20532     RExC_emit = ptr;
20533     return(ret);
20534 }
20535
20536 /*
20537 - reginsert - insert an operator in front of already-emitted operand
20538 *
20539 * That means that on exit 'operand' is the offset of the newly inserted
20540 * operator, and the original operand has been relocated.
20541 *
20542 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20543 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20544 *
20545 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20546 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20547 *
20548 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20549 */
20550 STATIC void
20551 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20552                   const regnode_offset operand, const U32 depth)
20553 {
20554     regnode *src;
20555     regnode *dst;
20556     regnode *place;
20557     const int offset = regarglen[(U8)op];
20558     const int size = NODE_STEP_REGNODE + offset;
20559     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20560
20561     PERL_ARGS_ASSERT_REGINSERT;
20562     PERL_UNUSED_CONTEXT;
20563     PERL_UNUSED_ARG(depth);
20564 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20565     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20566     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20567                                     studying. If this is wrong then we need to adjust RExC_recurse
20568                                     below like we do with RExC_open_parens/RExC_close_parens. */
20569     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20570     src = REGNODE_p(RExC_emit);
20571     RExC_emit += size;
20572     dst = REGNODE_p(RExC_emit);
20573
20574     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20575      * and [perl #133871] shows this can lead to problems, so skip this
20576      * realignment of parens until a later pass when they are reliable */
20577     if (! IN_PARENS_PASS && RExC_open_parens) {
20578         int paren;
20579         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20580         /* remember that RExC_npar is rex->nparens + 1,
20581          * iow it is 1 more than the number of parens seen in
20582          * the pattern so far. */
20583         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20584             /* note, RExC_open_parens[0] is the start of the
20585              * regex, it can't move. RExC_close_parens[0] is the end
20586              * of the regex, it *can* move. */
20587             if ( paren && RExC_open_parens[paren] >= operand ) {
20588                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20589                 RExC_open_parens[paren] += size;
20590             } else {
20591                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20592             }
20593             if ( RExC_close_parens[paren] >= operand ) {
20594                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20595                 RExC_close_parens[paren] += size;
20596             } else {
20597                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20598             }
20599         }
20600     }
20601     if (RExC_end_op)
20602         RExC_end_op += size;
20603
20604     while (src > REGNODE_p(operand)) {
20605         StructCopy(--src, --dst, regnode);
20606 #ifdef RE_TRACK_PATTERN_OFFSETS
20607         if (RExC_offsets) {     /* MJD 20010112 */
20608             MJD_OFFSET_DEBUG(
20609                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20610                   "reginsert",
20611                   __LINE__,
20612                   PL_reg_name[op],
20613                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20614                     ? "Overwriting end of array!\n" : "OK",
20615                   (UV)REGNODE_OFFSET(src),
20616                   (UV)REGNODE_OFFSET(dst),
20617                   (UV)RExC_offsets[0]));
20618             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20619             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20620         }
20621 #endif
20622     }
20623
20624     place = REGNODE_p(operand); /* Op node, where operand used to be. */
20625 #ifdef RE_TRACK_PATTERN_OFFSETS
20626     if (RExC_offsets) {         /* MJD */
20627         MJD_OFFSET_DEBUG(
20628               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20629               "reginsert",
20630               __LINE__,
20631               PL_reg_name[op],
20632               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20633               ? "Overwriting end of array!\n" : "OK",
20634               (UV)REGNODE_OFFSET(place),
20635               (UV)(RExC_parse - RExC_start),
20636               (UV)RExC_offsets[0]));
20637         Set_Node_Offset(place, RExC_parse);
20638         Set_Node_Length(place, 1);
20639     }
20640 #endif
20641     src = NEXTOPER(place);
20642     FLAGS(place) = 0;
20643     FILL_NODE(operand, op);
20644
20645     /* Zero out any arguments in the new node */
20646     Zero(src, offset, regnode);
20647 }
20648
20649 /*
20650 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20651             that value won't fit in the space available, instead returns FALSE.
20652             (Except asserts if we can't fit in the largest space the regex
20653             engine is designed for.)
20654 - SEE ALSO: regtail_study
20655 */
20656 STATIC bool
20657 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20658                 const regnode_offset p,
20659                 const regnode_offset val,
20660                 const U32 depth)
20661 {
20662     regnode_offset scan;
20663     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20664
20665     PERL_ARGS_ASSERT_REGTAIL;
20666 #ifndef DEBUGGING
20667     PERL_UNUSED_ARG(depth);
20668 #endif
20669
20670     /* The final node in the chain is the first one with a nonzero next pointer
20671      * */
20672     scan = (regnode_offset) p;
20673     for (;;) {
20674         regnode * const temp = regnext(REGNODE_p(scan));
20675         DEBUG_PARSE_r({
20676             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20677             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20678             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
20679                 SvPV_nolen_const(RExC_mysv), scan,
20680                     (temp == NULL ? "->" : ""),
20681                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20682             );
20683         });
20684         if (temp == NULL)
20685             break;
20686         scan = REGNODE_OFFSET(temp);
20687     }
20688
20689     /* Populate this node's next pointer */
20690     assert(val >= scan);
20691     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20692         assert((UV) (val - scan) <= U32_MAX);
20693         ARG_SET(REGNODE_p(scan), val - scan);
20694     }
20695     else {
20696         if (val - scan > U16_MAX) {
20697             /* Populate this with something that won't loop and will likely
20698              * lead to a crash if the caller ignores the failure return, and
20699              * execution continues */
20700             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20701             return FALSE;
20702         }
20703         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20704     }
20705
20706     return TRUE;
20707 }
20708
20709 #ifdef DEBUGGING
20710 /*
20711 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20712 - Look for optimizable sequences at the same time.
20713 - currently only looks for EXACT chains.
20714
20715 This is experimental code. The idea is to use this routine to perform
20716 in place optimizations on branches and groups as they are constructed,
20717 with the long term intention of removing optimization from study_chunk so
20718 that it is purely analytical.
20719
20720 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20721 to control which is which.
20722
20723 This used to return a value that was ignored.  It was a problem that it is
20724 #ifdef'd to be another function that didn't return a value.  khw has changed it
20725 so both currently return a pass/fail return.
20726
20727 */
20728 /* TODO: All four parms should be const */
20729
20730 STATIC bool
20731 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20732                       const regnode_offset val, U32 depth)
20733 {
20734     regnode_offset scan;
20735     U8 exact = PSEUDO;
20736 #ifdef EXPERIMENTAL_INPLACESCAN
20737     I32 min = 0;
20738 #endif
20739     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20740
20741     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20742
20743
20744     /* Find last node. */
20745
20746     scan = p;
20747     for (;;) {
20748         regnode * const temp = regnext(REGNODE_p(scan));
20749 #ifdef EXPERIMENTAL_INPLACESCAN
20750         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20751             bool unfolded_multi_char;   /* Unexamined in this routine */
20752             if (join_exact(pRExC_state, scan, &min,
20753                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20754                 return TRUE; /* Was return EXACT */
20755         }
20756 #endif
20757         if ( exact ) {
20758             switch (OP(REGNODE_p(scan))) {
20759                 case LEXACT:
20760                 case EXACT:
20761                 case LEXACT_REQ8:
20762                 case EXACT_REQ8:
20763                 case EXACTL:
20764                 case EXACTF:
20765                 case EXACTFU_S_EDGE:
20766                 case EXACTFAA_NO_TRIE:
20767                 case EXACTFAA:
20768                 case EXACTFU:
20769                 case EXACTFU_REQ8:
20770                 case EXACTFLU8:
20771                 case EXACTFUP:
20772                 case EXACTFL:
20773                         if( exact == PSEUDO )
20774                             exact= OP(REGNODE_p(scan));
20775                         else if ( exact != OP(REGNODE_p(scan)) )
20776                             exact= 0;
20777                 case NOTHING:
20778                     break;
20779                 default:
20780                     exact= 0;
20781             }
20782         }
20783         DEBUG_PARSE_r({
20784             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20785             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20786             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
20787                 SvPV_nolen_const(RExC_mysv),
20788                 scan,
20789                 PL_reg_name[exact]);
20790         });
20791         if (temp == NULL)
20792             break;
20793         scan = REGNODE_OFFSET(temp);
20794     }
20795     DEBUG_PARSE_r({
20796         DEBUG_PARSE_MSG("");
20797         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20798         Perl_re_printf( aTHX_
20799                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20800                       SvPV_nolen_const(RExC_mysv),
20801                       (IV)val,
20802                       (IV)(val - scan)
20803         );
20804     });
20805     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20806         assert((UV) (val - scan) <= U32_MAX);
20807         ARG_SET(REGNODE_p(scan), val - scan);
20808     }
20809     else {
20810         if (val - scan > U16_MAX) {
20811             /* Populate this with something that won't loop and will likely
20812              * lead to a crash if the caller ignores the failure return, and
20813              * execution continues */
20814             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20815             return FALSE;
20816         }
20817         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20818     }
20819
20820     return TRUE; /* Was 'return exact' */
20821 }
20822 #endif
20823
20824 STATIC SV*
20825 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20826
20827     /* Returns an inversion list of all the code points matched by the
20828      * ANYOFM/NANYOFM node 'n' */
20829
20830     SV * cp_list = _new_invlist(-1);
20831     const U8 lowest = (U8) ARG(n);
20832     unsigned int i;
20833     U8 count = 0;
20834     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20835
20836     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20837
20838     /* Starting with the lowest code point, any code point that ANDed with the
20839      * mask yields the lowest code point is in the set */
20840     for (i = lowest; i <= 0xFF; i++) {
20841         if ((i & FLAGS(n)) == ARG(n)) {
20842             cp_list = add_cp_to_invlist(cp_list, i);
20843             count++;
20844
20845             /* We know how many code points (a power of two) that are in the
20846              * set.  No use looking once we've got that number */
20847             if (count >= needed) break;
20848         }
20849     }
20850
20851     if (OP(n) == NANYOFM) {
20852         _invlist_invert(cp_list);
20853     }
20854     return cp_list;
20855 }
20856
20857 /*
20858  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20859  */
20860 #ifdef DEBUGGING
20861
20862 static void
20863 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20864 {
20865     int bit;
20866     int set=0;
20867
20868     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20869
20870     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20871         if (flags & (1<<bit)) {
20872             if (!set++ && lead)
20873                 Perl_re_printf( aTHX_  "%s", lead);
20874             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20875         }
20876     }
20877     if (lead)  {
20878         if (set)
20879             Perl_re_printf( aTHX_  "\n");
20880         else
20881             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20882     }
20883 }
20884
20885 static void
20886 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20887 {
20888     int bit;
20889     int set=0;
20890     regex_charset cs;
20891
20892     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20893
20894     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20895         if (flags & (1<<bit)) {
20896             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20897                 continue;
20898             }
20899             if (!set++ && lead)
20900                 Perl_re_printf( aTHX_  "%s", lead);
20901             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20902         }
20903     }
20904     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20905             if (!set++ && lead) {
20906                 Perl_re_printf( aTHX_  "%s", lead);
20907             }
20908             switch (cs) {
20909                 case REGEX_UNICODE_CHARSET:
20910                     Perl_re_printf( aTHX_  "UNICODE");
20911                     break;
20912                 case REGEX_LOCALE_CHARSET:
20913                     Perl_re_printf( aTHX_  "LOCALE");
20914                     break;
20915                 case REGEX_ASCII_RESTRICTED_CHARSET:
20916                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20917                     break;
20918                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20919                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20920                     break;
20921                 default:
20922                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20923                     break;
20924             }
20925     }
20926     if (lead)  {
20927         if (set)
20928             Perl_re_printf( aTHX_  "\n");
20929         else
20930             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20931     }
20932 }
20933 #endif
20934
20935 void
20936 Perl_regdump(pTHX_ const regexp *r)
20937 {
20938 #ifdef DEBUGGING
20939     int i;
20940     SV * const sv = sv_newmortal();
20941     SV *dsv= sv_newmortal();
20942     RXi_GET_DECL(r, ri);
20943     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20944
20945     PERL_ARGS_ASSERT_REGDUMP;
20946
20947     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20948
20949     /* Header fields of interest. */
20950     for (i = 0; i < 2; i++) {
20951         if (r->substrs->data[i].substr) {
20952             RE_PV_QUOTED_DECL(s, 0, dsv,
20953                             SvPVX_const(r->substrs->data[i].substr),
20954                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20955                             PL_dump_re_max_len);
20956             Perl_re_printf( aTHX_
20957                           "%s %s%s at %" IVdf "..%" UVuf " ",
20958                           i ? "floating" : "anchored",
20959                           s,
20960                           RE_SV_TAIL(r->substrs->data[i].substr),
20961                           (IV)r->substrs->data[i].min_offset,
20962                           (UV)r->substrs->data[i].max_offset);
20963         }
20964         else if (r->substrs->data[i].utf8_substr) {
20965             RE_PV_QUOTED_DECL(s, 1, dsv,
20966                             SvPVX_const(r->substrs->data[i].utf8_substr),
20967                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20968                             30);
20969             Perl_re_printf( aTHX_
20970                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20971                           i ? "floating" : "anchored",
20972                           s,
20973                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20974                           (IV)r->substrs->data[i].min_offset,
20975                           (UV)r->substrs->data[i].max_offset);
20976         }
20977     }
20978
20979     if (r->check_substr || r->check_utf8)
20980         Perl_re_printf( aTHX_
20981                       (const char *)
20982                       (   r->check_substr == r->substrs->data[1].substr
20983                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20984                        ? "(checking floating" : "(checking anchored"));
20985     if (r->intflags & PREGf_NOSCAN)
20986         Perl_re_printf( aTHX_  " noscan");
20987     if (r->extflags & RXf_CHECK_ALL)
20988         Perl_re_printf( aTHX_  " isall");
20989     if (r->check_substr || r->check_utf8)
20990         Perl_re_printf( aTHX_  ") ");
20991
20992     if (ri->regstclass) {
20993         regprop(r, sv, ri->regstclass, NULL, NULL);
20994         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20995     }
20996     if (r->intflags & PREGf_ANCH) {
20997         Perl_re_printf( aTHX_  "anchored");
20998         if (r->intflags & PREGf_ANCH_MBOL)
20999             Perl_re_printf( aTHX_  "(MBOL)");
21000         if (r->intflags & PREGf_ANCH_SBOL)
21001             Perl_re_printf( aTHX_  "(SBOL)");
21002         if (r->intflags & PREGf_ANCH_GPOS)
21003             Perl_re_printf( aTHX_  "(GPOS)");
21004         Perl_re_printf( aTHX_ " ");
21005     }
21006     if (r->intflags & PREGf_GPOS_SEEN)
21007         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
21008     if (r->intflags & PREGf_SKIP)
21009         Perl_re_printf( aTHX_  "plus ");
21010     if (r->intflags & PREGf_IMPLICIT)
21011         Perl_re_printf( aTHX_  "implicit ");
21012     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
21013     if (r->extflags & RXf_EVAL_SEEN)
21014         Perl_re_printf( aTHX_  "with eval ");
21015     Perl_re_printf( aTHX_  "\n");
21016     DEBUG_FLAGS_r({
21017         regdump_extflags("r->extflags: ", r->extflags);
21018         regdump_intflags("r->intflags: ", r->intflags);
21019     });
21020 #else
21021     PERL_ARGS_ASSERT_REGDUMP;
21022     PERL_UNUSED_CONTEXT;
21023     PERL_UNUSED_ARG(r);
21024 #endif  /* DEBUGGING */
21025 }
21026
21027 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21028 #ifdef DEBUGGING
21029
21030 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
21031      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
21032      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
21033      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
21034      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
21035      || _CC_VERTSPACE != 15
21036 #   error Need to adjust order of anyofs[]
21037 #  endif
21038 static const char * const anyofs[] = {
21039     "\\w",
21040     "\\W",
21041     "\\d",
21042     "\\D",
21043     "[:alpha:]",
21044     "[:^alpha:]",
21045     "[:lower:]",
21046     "[:^lower:]",
21047     "[:upper:]",
21048     "[:^upper:]",
21049     "[:punct:]",
21050     "[:^punct:]",
21051     "[:print:]",
21052     "[:^print:]",
21053     "[:alnum:]",
21054     "[:^alnum:]",
21055     "[:graph:]",
21056     "[:^graph:]",
21057     "[:cased:]",
21058     "[:^cased:]",
21059     "\\s",
21060     "\\S",
21061     "[:blank:]",
21062     "[:^blank:]",
21063     "[:xdigit:]",
21064     "[:^xdigit:]",
21065     "[:cntrl:]",
21066     "[:^cntrl:]",
21067     "[:ascii:]",
21068     "[:^ascii:]",
21069     "\\v",
21070     "\\V"
21071 };
21072 #endif
21073
21074 /*
21075 - regprop - printable representation of opcode, with run time support
21076 */
21077
21078 void
21079 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21080 {
21081 #ifdef DEBUGGING
21082     int k;
21083     RXi_GET_DECL(prog, progi);
21084     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21085
21086     PERL_ARGS_ASSERT_REGPROP;
21087
21088     SvPVCLEAR(sv);
21089
21090     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
21091         if (pRExC_state) {  /* This gives more info, if we have it */
21092             FAIL3("panic: corrupted regexp opcode %d > %d",
21093                   (int)OP(o), (int)REGNODE_MAX);
21094         }
21095         else {
21096             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21097                              (int)OP(o), (int)REGNODE_MAX);
21098         }
21099     }
21100     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21101
21102     k = PL_regkind[OP(o)];
21103
21104     if (k == EXACT) {
21105         sv_catpvs(sv, " ");
21106         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21107          * is a crude hack but it may be the best for now since
21108          * we have no flag "this EXACTish node was UTF-8"
21109          * --jhi */
21110         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21111                   PL_colors[0], PL_colors[1],
21112                   PERL_PV_ESCAPE_UNI_DETECT |
21113                   PERL_PV_ESCAPE_NONASCII   |
21114                   PERL_PV_PRETTY_ELLIPSES   |
21115                   PERL_PV_PRETTY_LTGT       |
21116                   PERL_PV_PRETTY_NOCLEAR
21117                   );
21118     } else if (k == TRIE) {
21119         /* print the details of the trie in dumpuntil instead, as
21120          * progi->data isn't available here */
21121         const char op = OP(o);
21122         const U32 n = ARG(o);
21123         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21124                (reg_ac_data *)progi->data->data[n] :
21125                NULL;
21126         const reg_trie_data * const trie
21127             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21128
21129         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21130         DEBUG_TRIE_COMPILE_r({
21131           if (trie->jump)
21132             sv_catpvs(sv, "(JUMP)");
21133           Perl_sv_catpvf(aTHX_ sv,
21134             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21135             (UV)trie->startstate,
21136             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21137             (UV)trie->wordcount,
21138             (UV)trie->minlen,
21139             (UV)trie->maxlen,
21140             (UV)TRIE_CHARCOUNT(trie),
21141             (UV)trie->uniquecharcount
21142           );
21143         });
21144         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21145             sv_catpvs(sv, "[");
21146             (void) put_charclass_bitmap_innards(sv,
21147                                                 ((IS_ANYOF_TRIE(op))
21148                                                  ? ANYOF_BITMAP(o)
21149                                                  : TRIE_BITMAP(trie)),
21150                                                 NULL,
21151                                                 NULL,
21152                                                 NULL,
21153                                                 0,
21154                                                 FALSE
21155                                                );
21156             sv_catpvs(sv, "]");
21157         }
21158     } else if (k == CURLY) {
21159         U32 lo = ARG1(o), hi = ARG2(o);
21160         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21161             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21162         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21163         if (hi == REG_INFTY)
21164             sv_catpvs(sv, "INFTY");
21165         else
21166             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21167         sv_catpvs(sv, "}");
21168     }
21169     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
21170         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21171     else if (k == REF || k == OPEN || k == CLOSE
21172              || k == GROUPP || OP(o)==ACCEPT)
21173     {
21174         AV *name_list= NULL;
21175         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21176         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
21177         if ( RXp_PAREN_NAMES(prog) ) {
21178             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21179         } else if ( pRExC_state ) {
21180             name_list= RExC_paren_name_list;
21181         }
21182         if (name_list) {
21183             if ( k != REF || (OP(o) < REFN)) {
21184                 SV **name= av_fetch(name_list, parno, 0 );
21185                 if (name)
21186                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21187             }
21188             else {
21189                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21190                 I32 *nums=(I32*)SvPVX(sv_dat);
21191                 SV **name= av_fetch(name_list, nums[0], 0 );
21192                 I32 n;
21193                 if (name) {
21194                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
21195                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21196                                     (n ? "," : ""), (IV)nums[n]);
21197                     }
21198                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21199                 }
21200             }
21201         }
21202         if ( k == REF && reginfo) {
21203             U32 n = ARG(o);  /* which paren pair */
21204             I32 ln = prog->offs[n].start;
21205             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21206                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21207             else if (ln == prog->offs[n].end)
21208                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21209             else {
21210                 const char *s = reginfo->strbeg + ln;
21211                 Perl_sv_catpvf(aTHX_ sv, ": ");
21212                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21213                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21214             }
21215         }
21216     } else if (k == GOSUB) {
21217         AV *name_list= NULL;
21218         if ( RXp_PAREN_NAMES(prog) ) {
21219             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21220         } else if ( pRExC_state ) {
21221             name_list= RExC_paren_name_list;
21222         }
21223
21224         /* Paren and offset */
21225         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21226                 (int)((o + (int)ARG2L(o)) - progi->program) );
21227         if (name_list) {
21228             SV **name= av_fetch(name_list, ARG(o), 0 );
21229             if (name)
21230                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21231         }
21232     }
21233     else if (k == LOGICAL)
21234         /* 2: embedded, otherwise 1 */
21235         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21236     else if (k == ANYOF || k == ANYOFR) {
21237         U8 flags;
21238         char * bitmap;
21239         U32 arg;
21240         bool do_sep = FALSE;    /* Do we need to separate various components of
21241                                    the output? */
21242         /* Set if there is still an unresolved user-defined property */
21243         SV *unresolved                = NULL;
21244
21245         /* Things that are ignored except when the runtime locale is UTF-8 */
21246         SV *only_utf8_locale_invlist = NULL;
21247
21248         /* Code points that don't fit in the bitmap */
21249         SV *nonbitmap_invlist = NULL;
21250
21251         /* And things that aren't in the bitmap, but are small enough to be */
21252         SV* bitmap_range_not_in_bitmap = NULL;
21253
21254         bool inverted;
21255
21256         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21257             flags = 0;
21258             bitmap = NULL;
21259             arg = 0;
21260         }
21261         else {
21262             flags = ANYOF_FLAGS(o);
21263             bitmap = ANYOF_BITMAP(o);
21264             arg = ARG(o);
21265         }
21266
21267         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21268             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21269                 sv_catpvs(sv, "{utf8-locale-reqd}");
21270             }
21271             if (flags & ANYOFL_FOLD) {
21272                 sv_catpvs(sv, "{i}");
21273             }
21274         }
21275
21276         inverted = flags & ANYOF_INVERT;
21277
21278         /* If there is stuff outside the bitmap, get it */
21279         if (arg != ANYOF_ONLY_HAS_BITMAP) {
21280             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21281                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21282                                             ANYOFRbase(o),
21283                                             ANYOFRbase(o) + ANYOFRdelta(o));
21284             }
21285             else {
21286 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21287                 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21288                                                 &unresolved,
21289                                                 &only_utf8_locale_invlist,
21290                                                 &nonbitmap_invlist);
21291 #else
21292                 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21293                                                 &unresolved,
21294                                                 &only_utf8_locale_invlist,
21295                                                 &nonbitmap_invlist);
21296 #endif
21297             }
21298
21299             /* The non-bitmap data may contain stuff that could fit in the
21300              * bitmap.  This could come from a user-defined property being
21301              * finally resolved when this call was done; or much more likely
21302              * because there are matches that require UTF-8 to be valid, and so
21303              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
21304             _invlist_intersection(nonbitmap_invlist,
21305                                   PL_InBitmap,
21306                                   &bitmap_range_not_in_bitmap);
21307             /* Leave just the things that don't fit into the bitmap */
21308             _invlist_subtract(nonbitmap_invlist,
21309                               PL_InBitmap,
21310                               &nonbitmap_invlist);
21311         }
21312
21313         /* Obey this flag to add all above-the-bitmap code points */
21314         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21315             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21316                                                       NUM_ANYOF_CODE_POINTS,
21317                                                       UV_MAX);
21318         }
21319
21320         /* Ready to start outputting.  First, the initial left bracket */
21321         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21322
21323         /* ANYOFH by definition doesn't have anything that will fit inside the
21324          * bitmap;  ANYOFR may or may not. */
21325         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21326             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21327                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21328         {
21329             /* Then all the things that could fit in the bitmap */
21330             do_sep = put_charclass_bitmap_innards(sv,
21331                                                   bitmap,
21332                                                   bitmap_range_not_in_bitmap,
21333                                                   only_utf8_locale_invlist,
21334                                                   o,
21335                                                   flags,
21336
21337                                                   /* Can't try inverting for a
21338                                                    * better display if there
21339                                                    * are things that haven't
21340                                                    * been resolved */
21341                                                   unresolved != NULL
21342                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
21343             SvREFCNT_dec(bitmap_range_not_in_bitmap);
21344
21345             /* If there are user-defined properties which haven't been defined
21346              * yet, output them.  If the result is not to be inverted, it is
21347              * clearest to output them in a separate [] from the bitmap range
21348              * stuff.  If the result is to be complemented, we have to show
21349              * everything in one [], as the inversion applies to the whole
21350              * thing.  Use {braces} to separate them from anything in the
21351              * bitmap and anything above the bitmap. */
21352             if (unresolved) {
21353                 if (inverted) {
21354                     if (! do_sep) { /* If didn't output anything in the bitmap
21355                                      */
21356                         sv_catpvs(sv, "^");
21357                     }
21358                     sv_catpvs(sv, "{");
21359                 }
21360                 else if (do_sep) {
21361                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21362                                                       PL_colors[0]);
21363                 }
21364                 sv_catsv(sv, unresolved);
21365                 if (inverted) {
21366                     sv_catpvs(sv, "}");
21367                 }
21368                 do_sep = ! inverted;
21369             }
21370         }
21371
21372         /* And, finally, add the above-the-bitmap stuff */
21373         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21374             SV* contents;
21375
21376             /* See if truncation size is overridden */
21377             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21378                                     ? PL_dump_re_max_len
21379                                     : 256;
21380
21381             /* This is output in a separate [] */
21382             if (do_sep) {
21383                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21384             }
21385
21386             /* And, for easy of understanding, it is shown in the
21387              * uncomplemented form if possible.  The one exception being if
21388              * there are unresolved items, where the inversion has to be
21389              * delayed until runtime */
21390             if (inverted && ! unresolved) {
21391                 _invlist_invert(nonbitmap_invlist);
21392                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21393             }
21394
21395             contents = invlist_contents(nonbitmap_invlist,
21396                                         FALSE /* output suitable for catsv */
21397                                        );
21398
21399             /* If the output is shorter than the permissible maximum, just do it. */
21400             if (SvCUR(contents) <= dump_len) {
21401                 sv_catsv(sv, contents);
21402             }
21403             else {
21404                 const char * contents_string = SvPVX(contents);
21405                 STRLEN i = dump_len;
21406
21407                 /* Otherwise, start at the permissible max and work back to the
21408                  * first break possibility */
21409                 while (i > 0 && contents_string[i] != ' ') {
21410                     i--;
21411                 }
21412                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21413                                        find a legal break */
21414                     i = dump_len;
21415                 }
21416
21417                 sv_catpvn(sv, contents_string, i);
21418                 sv_catpvs(sv, "...");
21419             }
21420
21421             SvREFCNT_dec_NN(contents);
21422             SvREFCNT_dec_NN(nonbitmap_invlist);
21423         }
21424
21425         /* And finally the matching, closing ']' */
21426         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21427
21428         if (OP(o) == ANYOFHs) {
21429             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21430         }
21431         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21432             U8 lowest = (OP(o) != ANYOFHr)
21433                          ? FLAGS(o)
21434                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21435             U8 highest = (OP(o) == ANYOFHr)
21436                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21437                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21438                            ? 0xFF
21439                            : lowest;
21440 #ifndef EBCDIC
21441             if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21442 #endif
21443             {
21444                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21445                 if (lowest != highest) {
21446                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21447                 }
21448                 Perl_sv_catpvf(aTHX_ sv, ")");
21449             }
21450         }
21451
21452         SvREFCNT_dec(unresolved);
21453     }
21454     else if (k == ANYOFM) {
21455         SV * cp_list = get_ANYOFM_contents(o);
21456
21457         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21458         if (OP(o) == NANYOFM) {
21459             _invlist_invert(cp_list);
21460         }
21461
21462         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21463         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21464
21465         SvREFCNT_dec(cp_list);
21466     }
21467     else if (k == POSIXD || k == NPOSIXD) {
21468         U8 index = FLAGS(o) * 2;
21469         if (index < C_ARRAY_LENGTH(anyofs)) {
21470             if (*anyofs[index] != '[')  {
21471                 sv_catpvs(sv, "[");
21472             }
21473             sv_catpv(sv, anyofs[index]);
21474             if (*anyofs[index] != '[')  {
21475                 sv_catpvs(sv, "]");
21476             }
21477         }
21478         else {
21479             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21480         }
21481     }
21482     else if (k == BOUND || k == NBOUND) {
21483         /* Must be synced with order of 'bound_type' in regcomp.h */
21484         const char * const bounds[] = {
21485             "",      /* Traditional */
21486             "{gcb}",
21487             "{lb}",
21488             "{sb}",
21489             "{wb}"
21490         };
21491         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21492         sv_catpv(sv, bounds[FLAGS(o)]);
21493     }
21494     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21495         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21496         if (o->next_off) {
21497             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21498         }
21499         Perl_sv_catpvf(aTHX_ sv, "]");
21500     }
21501     else if (OP(o) == SBOL)
21502         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21503
21504     /* add on the verb argument if there is one */
21505     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21506         if ( ARG(o) )
21507             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21508                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21509         else
21510             sv_catpvs(sv, ":NULL");
21511     }
21512 #else
21513     PERL_UNUSED_CONTEXT;
21514     PERL_UNUSED_ARG(sv);
21515     PERL_UNUSED_ARG(o);
21516     PERL_UNUSED_ARG(prog);
21517     PERL_UNUSED_ARG(reginfo);
21518     PERL_UNUSED_ARG(pRExC_state);
21519 #endif  /* DEBUGGING */
21520 }
21521
21522
21523
21524 SV *
21525 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21526 {                               /* Assume that RE_INTUIT is set */
21527     /* Returns an SV containing a string that must appear in the target for it
21528      * to match, or NULL if nothing is known that must match.
21529      *
21530      * CAUTION: the SV can be freed during execution of the regex engine */
21531
21532     struct regexp *const prog = ReANY(r);
21533     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21534
21535     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21536     PERL_UNUSED_CONTEXT;
21537
21538     DEBUG_COMPILE_r(
21539         {
21540             if (prog->maxlen > 0) {
21541                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21542                       ? prog->check_utf8 : prog->check_substr);
21543
21544                 if (!PL_colorset) reginitcolors();
21545                 Perl_re_printf( aTHX_
21546                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21547                       PL_colors[4],
21548                       RX_UTF8(r) ? "utf8 " : "",
21549                       PL_colors[5], PL_colors[0],
21550                       s,
21551                       PL_colors[1],
21552                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21553             }
21554         } );
21555
21556     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21557     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21558 }
21559
21560 /*
21561    pregfree()
21562
21563    handles refcounting and freeing the perl core regexp structure. When
21564    it is necessary to actually free the structure the first thing it
21565    does is call the 'free' method of the regexp_engine associated to
21566    the regexp, allowing the handling of the void *pprivate; member
21567    first. (This routine is not overridable by extensions, which is why
21568    the extensions free is called first.)
21569
21570    See regdupe and regdupe_internal if you change anything here.
21571 */
21572 #ifndef PERL_IN_XSUB_RE
21573 void
21574 Perl_pregfree(pTHX_ REGEXP *r)
21575 {
21576     SvREFCNT_dec(r);
21577 }
21578
21579 void
21580 Perl_pregfree2(pTHX_ REGEXP *rx)
21581 {
21582     struct regexp *const r = ReANY(rx);
21583     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21584
21585     PERL_ARGS_ASSERT_PREGFREE2;
21586
21587     if (! r)
21588         return;
21589
21590     if (r->mother_re) {
21591         ReREFCNT_dec(r->mother_re);
21592     } else {
21593         CALLREGFREE_PVT(rx); /* free the private data */
21594         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21595     }
21596     if (r->substrs) {
21597         int i;
21598         for (i = 0; i < 2; i++) {
21599             SvREFCNT_dec(r->substrs->data[i].substr);
21600             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21601         }
21602         Safefree(r->substrs);
21603     }
21604     RX_MATCH_COPY_FREE(rx);
21605 #ifdef PERL_ANY_COW
21606     SvREFCNT_dec(r->saved_copy);
21607 #endif
21608     Safefree(r->offs);
21609     SvREFCNT_dec(r->qr_anoncv);
21610     if (r->recurse_locinput)
21611         Safefree(r->recurse_locinput);
21612 }
21613
21614
21615 /*  reg_temp_copy()
21616
21617     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21618     except that dsv will be created if NULL.
21619
21620     This function is used in two main ways. First to implement
21621         $r = qr/....; $s = $$r;
21622
21623     Secondly, it is used as a hacky workaround to the structural issue of
21624     match results
21625     being stored in the regexp structure which is in turn stored in
21626     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21627     could be PL_curpm in multiple contexts, and could require multiple
21628     result sets being associated with the pattern simultaneously, such
21629     as when doing a recursive match with (??{$qr})
21630
21631     The solution is to make a lightweight copy of the regexp structure
21632     when a qr// is returned from the code executed by (??{$qr}) this
21633     lightweight copy doesn't actually own any of its data except for
21634     the starp/end and the actual regexp structure itself.
21635
21636 */
21637
21638
21639 REGEXP *
21640 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21641 {
21642     struct regexp *drx;
21643     struct regexp *const srx = ReANY(ssv);
21644     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21645
21646     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21647
21648     if (!dsv)
21649         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21650     else {
21651         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21652
21653         /* our only valid caller, sv_setsv_flags(), should have done
21654          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21655         assert(!SvOOK(dsv));
21656         assert(!SvIsCOW(dsv));
21657         assert(!SvROK(dsv));
21658
21659         if (SvPVX_const(dsv)) {
21660             if (SvLEN(dsv))
21661                 Safefree(SvPVX(dsv));
21662             SvPVX(dsv) = NULL;
21663         }
21664         SvLEN_set(dsv, 0);
21665         SvCUR_set(dsv, 0);
21666         SvOK_off((SV *)dsv);
21667
21668         if (islv) {
21669             /* For PVLVs, the head (sv_any) points to an XPVLV, while
21670              * the LV's xpvlenu_rx will point to a regexp body, which
21671              * we allocate here */
21672             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21673             assert(!SvPVX(dsv));
21674             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21675             temp->sv_any = NULL;
21676             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21677             SvREFCNT_dec_NN(temp);
21678             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21679                ing below will not set it. */
21680             SvCUR_set(dsv, SvCUR(ssv));
21681         }
21682     }
21683     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21684        sv_force_normal(sv) is called.  */
21685     SvFAKE_on(dsv);
21686     drx = ReANY(dsv);
21687
21688     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21689     SvPV_set(dsv, RX_WRAPPED(ssv));
21690     /* We share the same string buffer as the original regexp, on which we
21691        hold a reference count, incremented when mother_re is set below.
21692        The string pointer is copied here, being part of the regexp struct.
21693      */
21694     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21695            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21696     if (!islv)
21697         SvLEN_set(dsv, 0);
21698     if (srx->offs) {
21699         const I32 npar = srx->nparens+1;
21700         Newx(drx->offs, npar, regexp_paren_pair);
21701         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21702     }
21703     if (srx->substrs) {
21704         int i;
21705         Newx(drx->substrs, 1, struct reg_substr_data);
21706         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21707
21708         for (i = 0; i < 2; i++) {
21709             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21710             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21711         }
21712
21713         /* check_substr and check_utf8, if non-NULL, point to either their
21714            anchored or float namesakes, and don't hold a second reference.  */
21715     }
21716     RX_MATCH_COPIED_off(dsv);
21717 #ifdef PERL_ANY_COW
21718     drx->saved_copy = NULL;
21719 #endif
21720     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21721     SvREFCNT_inc_void(drx->qr_anoncv);
21722     if (srx->recurse_locinput)
21723         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21724
21725     return dsv;
21726 }
21727 #endif
21728
21729
21730 /* regfree_internal()
21731
21732    Free the private data in a regexp. This is overloadable by
21733    extensions. Perl takes care of the regexp structure in pregfree(),
21734    this covers the *pprivate pointer which technically perl doesn't
21735    know about, however of course we have to handle the
21736    regexp_internal structure when no extension is in use.
21737
21738    Note this is called before freeing anything in the regexp
21739    structure.
21740  */
21741
21742 void
21743 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21744 {
21745     struct regexp *const r = ReANY(rx);
21746     RXi_GET_DECL(r, ri);
21747     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21748
21749     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21750
21751     if (! ri) {
21752         return;
21753     }
21754
21755     DEBUG_COMPILE_r({
21756         if (!PL_colorset)
21757             reginitcolors();
21758         {
21759             SV *dsv= sv_newmortal();
21760             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21761                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21762             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21763                 PL_colors[4], PL_colors[5], s);
21764         }
21765     });
21766
21767 #ifdef RE_TRACK_PATTERN_OFFSETS
21768     if (ri->u.offsets)
21769         Safefree(ri->u.offsets);             /* 20010421 MJD */
21770 #endif
21771     if (ri->code_blocks)
21772         S_free_codeblocks(aTHX_ ri->code_blocks);
21773
21774     if (ri->data) {
21775         int n = ri->data->count;
21776
21777         while (--n >= 0) {
21778           /* If you add a ->what type here, update the comment in regcomp.h */
21779             switch (ri->data->what[n]) {
21780             case 'a':
21781             case 'r':
21782             case 's':
21783             case 'S':
21784             case 'u':
21785                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21786                 break;
21787             case 'f':
21788                 Safefree(ri->data->data[n]);
21789                 break;
21790             case 'l':
21791             case 'L':
21792                 break;
21793             case 'T':
21794                 { /* Aho Corasick add-on structure for a trie node.
21795                      Used in stclass optimization only */
21796                     U32 refcount;
21797                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21798 #ifdef USE_ITHREADS
21799 #endif
21800                     OP_REFCNT_LOCK;
21801                     refcount = --aho->refcount;
21802                     OP_REFCNT_UNLOCK;
21803                     if ( !refcount ) {
21804                         PerlMemShared_free(aho->states);
21805                         PerlMemShared_free(aho->fail);
21806                          /* do this last!!!! */
21807                         PerlMemShared_free(ri->data->data[n]);
21808                         /* we should only ever get called once, so
21809                          * assert as much, and also guard the free
21810                          * which /might/ happen twice. At the least
21811                          * it will make code anlyzers happy and it
21812                          * doesn't cost much. - Yves */
21813                         assert(ri->regstclass);
21814                         if (ri->regstclass) {
21815                             PerlMemShared_free(ri->regstclass);
21816                             ri->regstclass = 0;
21817                         }
21818                     }
21819                 }
21820                 break;
21821             case 't':
21822                 {
21823                     /* trie structure. */
21824                     U32 refcount;
21825                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21826 #ifdef USE_ITHREADS
21827 #endif
21828                     OP_REFCNT_LOCK;
21829                     refcount = --trie->refcount;
21830                     OP_REFCNT_UNLOCK;
21831                     if ( !refcount ) {
21832                         PerlMemShared_free(trie->charmap);
21833                         PerlMemShared_free(trie->states);
21834                         PerlMemShared_free(trie->trans);
21835                         if (trie->bitmap)
21836                             PerlMemShared_free(trie->bitmap);
21837                         if (trie->jump)
21838                             PerlMemShared_free(trie->jump);
21839                         PerlMemShared_free(trie->wordinfo);
21840                         /* do this last!!!! */
21841                         PerlMemShared_free(ri->data->data[n]);
21842                     }
21843                 }
21844                 break;
21845             default:
21846                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21847                                                     ri->data->what[n]);
21848             }
21849         }
21850         Safefree(ri->data->what);
21851         Safefree(ri->data);
21852     }
21853
21854     Safefree(ri);
21855 }
21856
21857 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21858 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21859 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
21860
21861 /*
21862 =for apidoc_section REGEXP Functions
21863 =for apidoc re_dup_guts
21864 Duplicate a regexp.
21865
21866 This routine is expected to clone a given regexp structure. It is only
21867 compiled under USE_ITHREADS.
21868
21869 After all of the core data stored in struct regexp is duplicated
21870 the regexp_engine.dupe method is used to copy any private data
21871 stored in the *pprivate pointer. This allows extensions to handle
21872 any duplication they need to do.
21873
21874 =cut
21875
21876    See pregfree() and regfree_internal() if you change anything here.
21877 */
21878 #if defined(USE_ITHREADS)
21879 #ifndef PERL_IN_XSUB_RE
21880 void
21881 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21882 {
21883     I32 npar;
21884     const struct regexp *r = ReANY(sstr);
21885     struct regexp *ret = ReANY(dstr);
21886
21887     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21888
21889     npar = r->nparens+1;
21890     Newx(ret->offs, npar, regexp_paren_pair);
21891     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21892
21893     if (ret->substrs) {
21894         /* Do it this way to avoid reading from *r after the StructCopy().
21895            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21896            cache, it doesn't matter.  */
21897         int i;
21898         const bool anchored = r->check_substr
21899             ? r->check_substr == r->substrs->data[0].substr
21900             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21901         Newx(ret->substrs, 1, struct reg_substr_data);
21902         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21903
21904         for (i = 0; i < 2; i++) {
21905             ret->substrs->data[i].substr =
21906                         sv_dup_inc(ret->substrs->data[i].substr, param);
21907             ret->substrs->data[i].utf8_substr =
21908                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21909         }
21910
21911         /* check_substr and check_utf8, if non-NULL, point to either their
21912            anchored or float namesakes, and don't hold a second reference.  */
21913
21914         if (ret->check_substr) {
21915             if (anchored) {
21916                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21917
21918                 ret->check_substr = ret->substrs->data[0].substr;
21919                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21920             } else {
21921                 assert(r->check_substr == r->substrs->data[1].substr);
21922                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21923
21924                 ret->check_substr = ret->substrs->data[1].substr;
21925                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21926             }
21927         } else if (ret->check_utf8) {
21928             if (anchored) {
21929                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21930             } else {
21931                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21932             }
21933         }
21934     }
21935
21936     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21937     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21938     if (r->recurse_locinput)
21939         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21940
21941     if (ret->pprivate)
21942         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21943
21944     if (RX_MATCH_COPIED(dstr))
21945         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21946     else
21947         ret->subbeg = NULL;
21948 #ifdef PERL_ANY_COW
21949     ret->saved_copy = NULL;
21950 #endif
21951
21952     /* Whether mother_re be set or no, we need to copy the string.  We
21953        cannot refrain from copying it when the storage points directly to
21954        our mother regexp, because that's
21955                1: a buffer in a different thread
21956                2: something we no longer hold a reference on
21957                so we need to copy it locally.  */
21958     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21959     /* set malloced length to a non-zero value so it will be freed
21960      * (otherwise in combination with SVf_FAKE it looks like an alien
21961      * buffer). It doesn't have to be the actual malloced size, since it
21962      * should never be grown */
21963     SvLEN_set(dstr, SvCUR(sstr)+1);
21964     ret->mother_re   = NULL;
21965 }
21966 #endif /* PERL_IN_XSUB_RE */
21967
21968 /*
21969    regdupe_internal()
21970
21971    This is the internal complement to regdupe() which is used to copy
21972    the structure pointed to by the *pprivate pointer in the regexp.
21973    This is the core version of the extension overridable cloning hook.
21974    The regexp structure being duplicated will be copied by perl prior
21975    to this and will be provided as the regexp *r argument, however
21976    with the /old/ structures pprivate pointer value. Thus this routine
21977    may override any copying normally done by perl.
21978
21979    It returns a pointer to the new regexp_internal structure.
21980 */
21981
21982 void *
21983 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21984 {
21985     struct regexp *const r = ReANY(rx);
21986     regexp_internal *reti;
21987     int len;
21988     RXi_GET_DECL(r, ri);
21989
21990     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21991
21992     len = ProgLen(ri);
21993
21994     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21995           char, regexp_internal);
21996     Copy(ri->program, reti->program, len+1, regnode);
21997
21998
21999     if (ri->code_blocks) {
22000         int n;
22001         Newx(reti->code_blocks, 1, struct reg_code_blocks);
22002         Newx(reti->code_blocks->cb, ri->code_blocks->count,
22003                     struct reg_code_block);
22004         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22005              ri->code_blocks->count, struct reg_code_block);
22006         for (n = 0; n < ri->code_blocks->count; n++)
22007              reti->code_blocks->cb[n].src_regex = (REGEXP*)
22008                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22009         reti->code_blocks->count = ri->code_blocks->count;
22010         reti->code_blocks->refcnt = 1;
22011     }
22012     else
22013         reti->code_blocks = NULL;
22014
22015     reti->regstclass = NULL;
22016
22017     if (ri->data) {
22018         struct reg_data *d;
22019         const int count = ri->data->count;
22020         int i;
22021
22022         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22023                 char, struct reg_data);
22024         Newx(d->what, count, U8);
22025
22026         d->count = count;
22027         for (i = 0; i < count; i++) {
22028             d->what[i] = ri->data->what[i];
22029             switch (d->what[i]) {
22030                 /* see also regcomp.h and regfree_internal() */
22031             case 'a': /* actually an AV, but the dup function is identical.
22032                          values seem to be "plain sv's" generally. */
22033             case 'r': /* a compiled regex (but still just another SV) */
22034             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22035                          this use case should go away, the code could have used
22036                          'a' instead - see S_set_ANYOF_arg() for array contents. */
22037             case 'S': /* actually an SV, but the dup function is identical.  */
22038             case 'u': /* actually an HV, but the dup function is identical.
22039                          values are "plain sv's" */
22040                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22041                 break;
22042             case 'f':
22043                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22044                  * patterns which could start with several different things. Pre-TRIE
22045                  * this was more important than it is now, however this still helps
22046                  * in some places, for instance /x?a+/ might produce a SSC equivalent
22047                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22048                  * in regexec.c
22049                  */
22050                 /* This is cheating. */
22051                 Newx(d->data[i], 1, regnode_ssc);
22052                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22053                 reti->regstclass = (regnode*)d->data[i];
22054                 break;
22055             case 'T':
22056                 /* AHO-CORASICK fail table */
22057                 /* Trie stclasses are readonly and can thus be shared
22058                  * without duplication. We free the stclass in pregfree
22059                  * when the corresponding reg_ac_data struct is freed.
22060                  */
22061                 reti->regstclass= ri->regstclass;
22062                 /* FALLTHROUGH */
22063             case 't':
22064                 /* TRIE transition table */
22065                 OP_REFCNT_LOCK;
22066                 ((reg_trie_data*)ri->data->data[i])->refcount++;
22067                 OP_REFCNT_UNLOCK;
22068                 /* FALLTHROUGH */
22069             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22070             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22071                          is not from another regexp */
22072                 d->data[i] = ri->data->data[i];
22073                 break;
22074             default:
22075                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22076                                                            ri->data->what[i]);
22077             }
22078         }
22079
22080         reti->data = d;
22081     }
22082     else
22083         reti->data = NULL;
22084
22085     reti->name_list_idx = ri->name_list_idx;
22086
22087 #ifdef RE_TRACK_PATTERN_OFFSETS
22088     if (ri->u.offsets) {
22089         Newx(reti->u.offsets, 2*len+1, U32);
22090         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22091     }
22092 #else
22093     SetProgLen(reti, len);
22094 #endif
22095
22096     return (void*)reti;
22097 }
22098
22099 #endif    /* USE_ITHREADS */
22100
22101 #ifndef PERL_IN_XSUB_RE
22102
22103 /*
22104  - regnext - dig the "next" pointer out of a node
22105  */
22106 regnode *
22107 Perl_regnext(pTHX_ regnode *p)
22108 {
22109     I32 offset;
22110
22111     if (!p)
22112         return(NULL);
22113
22114     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
22115         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22116                                                 (int)OP(p), (int)REGNODE_MAX);
22117     }
22118
22119     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22120     if (offset == 0)
22121         return(NULL);
22122
22123     return(p+offset);
22124 }
22125
22126 #endif
22127
22128 STATIC void
22129 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22130 {
22131     va_list args;
22132     STRLEN len = strlen(pat);
22133     char buf[512];
22134     SV *msv;
22135     const char *message;
22136
22137     PERL_ARGS_ASSERT_RE_CROAK;
22138
22139     if (len > 510)
22140         len = 510;
22141     Copy(pat, buf, len , char);
22142     buf[len] = '\n';
22143     buf[len + 1] = '\0';
22144     va_start(args, pat);
22145     msv = vmess(buf, &args);
22146     va_end(args);
22147     message = SvPV_const(msv, len);
22148     if (len > 512)
22149         len = 512;
22150     Copy(message, buf, len , char);
22151     /* len-1 to avoid \n */
22152     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22153 }
22154
22155 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
22156
22157 #ifndef PERL_IN_XSUB_RE
22158 void
22159 Perl_save_re_context(pTHX)
22160 {
22161     I32 nparens = -1;
22162     I32 i;
22163
22164     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22165
22166     if (PL_curpm) {
22167         const REGEXP * const rx = PM_GETRE(PL_curpm);
22168         if (rx)
22169             nparens = RX_NPARENS(rx);
22170     }
22171
22172     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22173      * that PL_curpm will be null, but that utf8.pm and the modules it
22174      * loads will only use $1..$3.
22175      * The t/porting/re_context.t test file checks this assumption.
22176      */
22177     if (nparens == -1)
22178         nparens = 3;
22179
22180     for (i = 1; i <= nparens; i++) {
22181         char digits[TYPE_CHARS(long)];
22182         const STRLEN len = my_snprintf(digits, sizeof(digits),
22183                                        "%lu", (long)i);
22184         GV *const *const gvp
22185             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22186
22187         if (gvp) {
22188             GV * const gv = *gvp;
22189             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22190                 save_scalar(gv);
22191         }
22192     }
22193 }
22194 #endif
22195
22196 #ifdef DEBUGGING
22197
22198 STATIC void
22199 S_put_code_point(pTHX_ SV *sv, UV c)
22200 {
22201     PERL_ARGS_ASSERT_PUT_CODE_POINT;
22202
22203     if (c > 255) {
22204         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22205     }
22206     else if (isPRINT(c)) {
22207         const char string = (char) c;
22208
22209         /* We use {phrase} as metanotation in the class, so also escape literal
22210          * braces */
22211         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22212             sv_catpvs(sv, "\\");
22213         sv_catpvn(sv, &string, 1);
22214     }
22215     else if (isMNEMONIC_CNTRL(c)) {
22216         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22217     }
22218     else {
22219         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22220     }
22221 }
22222
22223 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22224
22225 STATIC void
22226 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22227 {
22228     /* Appends to 'sv' a displayable version of the range of code points from
22229      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
22230      * that have them, when they occur at the beginning or end of the range.
22231      * It uses hex to output the remaining code points, unless 'allow_literals'
22232      * is true, in which case the printable ASCII ones are output as-is (though
22233      * some of these will be escaped by put_code_point()).
22234      *
22235      * NOTE:  This is designed only for printing ranges of code points that fit
22236      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
22237      */
22238
22239     const unsigned int min_range_count = 3;
22240
22241     assert(start <= end);
22242
22243     PERL_ARGS_ASSERT_PUT_RANGE;
22244
22245     while (start <= end) {
22246         UV this_end;
22247         const char * format;
22248
22249         if (    end - start < min_range_count
22250             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22251         {
22252             /* Output a range of 1 or 2 chars individually, or longer ranges
22253              * when printable */
22254             for (; start <= end; start++) {
22255                 put_code_point(sv, start);
22256             }
22257             break;
22258         }
22259
22260         /* If permitted by the input options, and there is a possibility that
22261          * this range contains a printable literal, look to see if there is
22262          * one. */
22263         if (allow_literals && start <= MAX_PRINT_A) {
22264
22265             /* If the character at the beginning of the range isn't an ASCII
22266              * printable, effectively split the range into two parts:
22267              *  1) the portion before the first such printable,
22268              *  2) the rest
22269              * and output them separately. */
22270             if (! isPRINT_A(start)) {
22271                 UV temp_end = start + 1;
22272
22273                 /* There is no point looking beyond the final possible
22274                  * printable, in MAX_PRINT_A */
22275                 UV max = MIN(end, MAX_PRINT_A);
22276
22277                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22278                     temp_end++;
22279                 }
22280
22281                 /* Here, temp_end points to one beyond the first printable if
22282                  * found, or to one beyond 'max' if not.  If none found, make
22283                  * sure that we use the entire range */
22284                 if (temp_end > MAX_PRINT_A) {
22285                     temp_end = end + 1;
22286                 }
22287
22288                 /* Output the first part of the split range: the part that
22289                  * doesn't have printables, with the parameter set to not look
22290                  * for literals (otherwise we would infinitely recurse) */
22291                 put_range(sv, start, temp_end - 1, FALSE);
22292
22293                 /* The 2nd part of the range (if any) starts here. */
22294                 start = temp_end;
22295
22296                 /* We do a continue, instead of dropping down, because even if
22297                  * the 2nd part is non-empty, it could be so short that we want
22298                  * to output it as individual characters, as tested for at the
22299                  * top of this loop.  */
22300                 continue;
22301             }
22302
22303             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
22304              * output a sub-range of just the digits or letters, then process
22305              * the remaining portion as usual. */
22306             if (isALPHANUMERIC_A(start)) {
22307                 UV mask = (isDIGIT_A(start))
22308                            ? _CC_DIGIT
22309                              : isUPPER_A(start)
22310                                ? _CC_UPPER
22311                                : _CC_LOWER;
22312                 UV temp_end = start + 1;
22313
22314                 /* Find the end of the sub-range that includes just the
22315                  * characters in the same class as the first character in it */
22316                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22317                     temp_end++;
22318                 }
22319                 temp_end--;
22320
22321                 /* For short ranges, don't duplicate the code above to output
22322                  * them; just call recursively */
22323                 if (temp_end - start < min_range_count) {
22324                     put_range(sv, start, temp_end, FALSE);
22325                 }
22326                 else {  /* Output as a range */
22327                     put_code_point(sv, start);
22328                     sv_catpvs(sv, "-");
22329                     put_code_point(sv, temp_end);
22330                 }
22331                 start = temp_end + 1;
22332                 continue;
22333             }
22334
22335             /* We output any other printables as individual characters */
22336             if (isPUNCT_A(start) || isSPACE_A(start)) {
22337                 while (start <= end && (isPUNCT_A(start)
22338                                         || isSPACE_A(start)))
22339                 {
22340                     put_code_point(sv, start);
22341                     start++;
22342                 }
22343                 continue;
22344             }
22345         } /* End of looking for literals */
22346
22347         /* Here is not to output as a literal.  Some control characters have
22348          * mnemonic names.  Split off any of those at the beginning and end of
22349          * the range to print mnemonically.  It isn't possible for many of
22350          * these to be in a row, so this won't overwhelm with output */
22351         if (   start <= end
22352             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22353         {
22354             while (isMNEMONIC_CNTRL(start) && start <= end) {
22355                 put_code_point(sv, start);
22356                 start++;
22357             }
22358
22359             /* If this didn't take care of the whole range ... */
22360             if (start <= end) {
22361
22362                 /* Look backwards from the end to find the final non-mnemonic
22363                  * */
22364                 UV temp_end = end;
22365                 while (isMNEMONIC_CNTRL(temp_end)) {
22366                     temp_end--;
22367                 }
22368
22369                 /* And separately output the interior range that doesn't start
22370                  * or end with mnemonics */
22371                 put_range(sv, start, temp_end, FALSE);
22372
22373                 /* Then output the mnemonic trailing controls */
22374                 start = temp_end + 1;
22375                 while (start <= end) {
22376                     put_code_point(sv, start);
22377                     start++;
22378                 }
22379                 break;
22380             }
22381         }
22382
22383         /* As a final resort, output the range or subrange as hex. */
22384
22385         if (start >= NUM_ANYOF_CODE_POINTS) {
22386             this_end = end;
22387         }
22388         else {  /* Have to split range at the bitmap boundary */
22389             this_end = (end < NUM_ANYOF_CODE_POINTS)
22390                         ? end
22391                         : NUM_ANYOF_CODE_POINTS - 1;
22392         }
22393 #if NUM_ANYOF_CODE_POINTS > 256
22394         format = (this_end < 256)
22395                  ? "\\x%02" UVXf "-\\x%02" UVXf
22396                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22397 #else
22398         format = "\\x%02" UVXf "-\\x%02" UVXf;
22399 #endif
22400         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22401         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22402         GCC_DIAG_RESTORE_STMT;
22403         break;
22404     }
22405 }
22406
22407 STATIC void
22408 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22409 {
22410     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22411      * 'invlist' */
22412
22413     UV start, end;
22414     bool allow_literals = TRUE;
22415
22416     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22417
22418     /* Generally, it is more readable if printable characters are output as
22419      * literals, but if a range (nearly) spans all of them, it's best to output
22420      * it as a single range.  This code will use a single range if all but 2
22421      * ASCII printables are in it */
22422     invlist_iterinit(invlist);
22423     while (invlist_iternext(invlist, &start, &end)) {
22424
22425         /* If the range starts beyond the final printable, it doesn't have any
22426          * in it */
22427         if (start > MAX_PRINT_A) {
22428             break;
22429         }
22430
22431         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22432          * all but two, the range must start and end no later than 2 from
22433          * either end */
22434         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22435             if (end > MAX_PRINT_A) {
22436                 end = MAX_PRINT_A;
22437             }
22438             if (start < ' ') {
22439                 start = ' ';
22440             }
22441             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22442                 allow_literals = FALSE;
22443             }
22444             break;
22445         }
22446     }
22447     invlist_iterfinish(invlist);
22448
22449     /* Here we have figured things out.  Output each range */
22450     invlist_iterinit(invlist);
22451     while (invlist_iternext(invlist, &start, &end)) {
22452         if (start >= NUM_ANYOF_CODE_POINTS) {
22453             break;
22454         }
22455         put_range(sv, start, end, allow_literals);
22456     }
22457     invlist_iterfinish(invlist);
22458
22459     return;
22460 }
22461
22462 STATIC SV*
22463 S_put_charclass_bitmap_innards_common(pTHX_
22464         SV* invlist,            /* The bitmap */
22465         SV* posixes,            /* Under /l, things like [:word:], \S */
22466         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22467         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22468         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22469         const bool invert       /* Is the result to be inverted? */
22470 )
22471 {
22472     /* Create and return an SV containing a displayable version of the bitmap
22473      * and associated information determined by the input parameters.  If the
22474      * output would have been only the inversion indicator '^', NULL is instead
22475      * returned. */
22476
22477     SV * output;
22478
22479     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22480
22481     if (invert) {
22482         output = newSVpvs("^");
22483     }
22484     else {
22485         output = newSVpvs("");
22486     }
22487
22488     /* First, the code points in the bitmap that are unconditionally there */
22489     put_charclass_bitmap_innards_invlist(output, invlist);
22490
22491     /* Traditionally, these have been placed after the main code points */
22492     if (posixes) {
22493         sv_catsv(output, posixes);
22494     }
22495
22496     if (only_utf8 && _invlist_len(only_utf8)) {
22497         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22498         put_charclass_bitmap_innards_invlist(output, only_utf8);
22499     }
22500
22501     if (not_utf8 && _invlist_len(not_utf8)) {
22502         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22503         put_charclass_bitmap_innards_invlist(output, not_utf8);
22504     }
22505
22506     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22507         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22508         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22509
22510         /* This is the only list in this routine that can legally contain code
22511          * points outside the bitmap range.  The call just above to
22512          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22513          * output them here.  There's about a half-dozen possible, and none in
22514          * contiguous ranges longer than 2 */
22515         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22516             UV start, end;
22517             SV* above_bitmap = NULL;
22518
22519             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22520
22521             invlist_iterinit(above_bitmap);
22522             while (invlist_iternext(above_bitmap, &start, &end)) {
22523                 UV i;
22524
22525                 for (i = start; i <= end; i++) {
22526                     put_code_point(output, i);
22527                 }
22528             }
22529             invlist_iterfinish(above_bitmap);
22530             SvREFCNT_dec_NN(above_bitmap);
22531         }
22532     }
22533
22534     if (invert && SvCUR(output) == 1) {
22535         return NULL;
22536     }
22537
22538     return output;
22539 }
22540
22541 STATIC bool
22542 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22543                                      char *bitmap,
22544                                      SV *nonbitmap_invlist,
22545                                      SV *only_utf8_locale_invlist,
22546                                      const regnode * const node,
22547                                      const U8 flags,
22548                                      const bool force_as_is_display)
22549 {
22550     /* Appends to 'sv' a displayable version of the innards of the bracketed
22551      * character class defined by the other arguments:
22552      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22553      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22554      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22555      *      none.  The reasons for this could be that they require some
22556      *      condition such as the target string being or not being in UTF-8
22557      *      (under /d), or because they came from a user-defined property that
22558      *      was not resolved at the time of the regex compilation (under /u)
22559      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22560      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22561      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22562      *      above two parameters are not null, and is passed so that this
22563      *      routine can tease apart the various reasons for them.
22564      *  'flags' is the flags field of 'node'
22565      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22566      *      to invert things to see if that leads to a cleaner display.  If
22567      *      FALSE, this routine is free to use its judgment about doing this.
22568      *
22569      * It returns TRUE if there was actually something output.  (It may be that
22570      * the bitmap, etc is empty.)
22571      *
22572      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22573      * bitmap, with the succeeding parameters set to NULL, and the final one to
22574      * FALSE.
22575      */
22576
22577     /* In general, it tries to display the 'cleanest' representation of the
22578      * innards, choosing whether to display them inverted or not, regardless of
22579      * whether the class itself is to be inverted.  However,  there are some
22580      * cases where it can't try inverting, as what actually matches isn't known
22581      * until runtime, and hence the inversion isn't either. */
22582
22583     bool inverting_allowed = ! force_as_is_display;
22584
22585     int i;
22586     STRLEN orig_sv_cur = SvCUR(sv);
22587
22588     SV* invlist;            /* Inversion list we accumulate of code points that
22589                                are unconditionally matched */
22590     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22591                                UTF-8 */
22592     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22593                              */
22594     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22595     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22596                                        is UTF-8 */
22597
22598     SV* as_is_display;      /* The output string when we take the inputs
22599                                literally */
22600     SV* inverted_display;   /* The output string when we invert the inputs */
22601
22602     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22603                                                    to match? */
22604     /* We are biased in favor of displaying things without them being inverted,
22605      * as that is generally easier to understand */
22606     const int bias = 5;
22607
22608     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22609
22610     /* Start off with whatever code points are passed in.  (We clone, so we
22611      * don't change the caller's list) */
22612     if (nonbitmap_invlist) {
22613         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22614         invlist = invlist_clone(nonbitmap_invlist, NULL);
22615     }
22616     else {  /* Worst case size is every other code point is matched */
22617         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22618     }
22619
22620     if (flags) {
22621         if (OP(node) == ANYOFD) {
22622
22623             /* This flag indicates that the code points below 0x100 in the
22624              * nonbitmap list are precisely the ones that match only when the
22625              * target is UTF-8 (they should all be non-ASCII). */
22626             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22627             {
22628                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22629                 _invlist_subtract(invlist, only_utf8, &invlist);
22630             }
22631
22632             /* And this flag for matching all non-ASCII 0xFF and below */
22633             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22634             {
22635                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22636             }
22637         }
22638         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22639
22640             /* If either of these flags are set, what matches isn't
22641              * determinable except during execution, so don't know enough here
22642              * to invert */
22643             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22644                 inverting_allowed = FALSE;
22645             }
22646
22647             /* What the posix classes match also varies at runtime, so these
22648              * will be output symbolically. */
22649             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22650                 int i;
22651
22652                 posixes = newSVpvs("");
22653                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22654                     if (ANYOF_POSIXL_TEST(node, i)) {
22655                         sv_catpv(posixes, anyofs[i]);
22656                     }
22657                 }
22658             }
22659         }
22660     }
22661
22662     /* Accumulate the bit map into the unconditional match list */
22663     if (bitmap) {
22664         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22665             if (BITMAP_TEST(bitmap, i)) {
22666                 int start = i++;
22667                 for (;
22668                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22669                      i++)
22670                 { /* empty */ }
22671                 invlist = _add_range_to_invlist(invlist, start, i-1);
22672             }
22673         }
22674     }
22675
22676     /* Make sure that the conditional match lists don't have anything in them
22677      * that match unconditionally; otherwise the output is quite confusing.
22678      * This could happen if the code that populates these misses some
22679      * duplication. */
22680     if (only_utf8) {
22681         _invlist_subtract(only_utf8, invlist, &only_utf8);
22682     }
22683     if (not_utf8) {
22684         _invlist_subtract(not_utf8, invlist, &not_utf8);
22685     }
22686
22687     if (only_utf8_locale_invlist) {
22688
22689         /* Since this list is passed in, we have to make a copy before
22690          * modifying it */
22691         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22692
22693         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22694
22695         /* And, it can get really weird for us to try outputting an inverted
22696          * form of this list when it has things above the bitmap, so don't even
22697          * try */
22698         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22699             inverting_allowed = FALSE;
22700         }
22701     }
22702
22703     /* Calculate what the output would be if we take the input as-is */
22704     as_is_display = put_charclass_bitmap_innards_common(invlist,
22705                                                     posixes,
22706                                                     only_utf8,
22707                                                     not_utf8,
22708                                                     only_utf8_locale,
22709                                                     invert);
22710
22711     /* If have to take the output as-is, just do that */
22712     if (! inverting_allowed) {
22713         if (as_is_display) {
22714             sv_catsv(sv, as_is_display);
22715             SvREFCNT_dec_NN(as_is_display);
22716         }
22717     }
22718     else { /* But otherwise, create the output again on the inverted input, and
22719               use whichever version is shorter */
22720
22721         int inverted_bias, as_is_bias;
22722
22723         /* We will apply our bias to whichever of the results doesn't have
22724          * the '^' */
22725         if (invert) {
22726             invert = FALSE;
22727             as_is_bias = bias;
22728             inverted_bias = 0;
22729         }
22730         else {
22731             invert = TRUE;
22732             as_is_bias = 0;
22733             inverted_bias = bias;
22734         }
22735
22736         /* Now invert each of the lists that contribute to the output,
22737          * excluding from the result things outside the possible range */
22738
22739         /* For the unconditional inversion list, we have to add in all the
22740          * conditional code points, so that when inverted, they will be gone
22741          * from it */
22742         _invlist_union(only_utf8, invlist, &invlist);
22743         _invlist_union(not_utf8, invlist, &invlist);
22744         _invlist_union(only_utf8_locale, invlist, &invlist);
22745         _invlist_invert(invlist);
22746         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22747
22748         if (only_utf8) {
22749             _invlist_invert(only_utf8);
22750             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22751         }
22752         else if (not_utf8) {
22753
22754             /* If a code point matches iff the target string is not in UTF-8,
22755              * then complementing the result has it not match iff not in UTF-8,
22756              * which is the same thing as matching iff it is UTF-8. */
22757             only_utf8 = not_utf8;
22758             not_utf8 = NULL;
22759         }
22760
22761         if (only_utf8_locale) {
22762             _invlist_invert(only_utf8_locale);
22763             _invlist_intersection(only_utf8_locale,
22764                                   PL_InBitmap,
22765                                   &only_utf8_locale);
22766         }
22767
22768         inverted_display = put_charclass_bitmap_innards_common(
22769                                             invlist,
22770                                             posixes,
22771                                             only_utf8,
22772                                             not_utf8,
22773                                             only_utf8_locale, invert);
22774
22775         /* Use the shortest representation, taking into account our bias
22776          * against showing it inverted */
22777         if (   inverted_display
22778             && (   ! as_is_display
22779                 || (  SvCUR(inverted_display) + inverted_bias
22780                     < SvCUR(as_is_display)    + as_is_bias)))
22781         {
22782             sv_catsv(sv, inverted_display);
22783         }
22784         else if (as_is_display) {
22785             sv_catsv(sv, as_is_display);
22786         }
22787
22788         SvREFCNT_dec(as_is_display);
22789         SvREFCNT_dec(inverted_display);
22790     }
22791
22792     SvREFCNT_dec_NN(invlist);
22793     SvREFCNT_dec(only_utf8);
22794     SvREFCNT_dec(not_utf8);
22795     SvREFCNT_dec(posixes);
22796     SvREFCNT_dec(only_utf8_locale);
22797
22798     return SvCUR(sv) > orig_sv_cur;
22799 }
22800
22801 #define CLEAR_OPTSTART                                                       \
22802     if (optstart) STMT_START {                                               \
22803         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22804                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22805         optstart=NULL;                                                       \
22806     } STMT_END
22807
22808 #define DUMPUNTIL(b,e)                                                       \
22809                     CLEAR_OPTSTART;                                          \
22810                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22811
22812 STATIC const regnode *
22813 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22814             const regnode *last, const regnode *plast,
22815             SV* sv, I32 indent, U32 depth)
22816 {
22817     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22818     const regnode *next;
22819     const regnode *optstart= NULL;
22820
22821     RXi_GET_DECL(r, ri);
22822     DECLARE_AND_GET_RE_DEBUG_FLAGS;
22823
22824     PERL_ARGS_ASSERT_DUMPUNTIL;
22825
22826 #ifdef DEBUG_DUMPUNTIL
22827     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22828         last ? last-start : 0, plast ? plast-start : 0);
22829 #endif
22830
22831     if (plast && plast < last)
22832         last= plast;
22833
22834     while (PL_regkind[op] != END && (!last || node < last)) {
22835         assert(node);
22836         /* While that wasn't END last time... */
22837         NODE_ALIGN(node);
22838         op = OP(node);
22839         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22840             indent--;
22841         next = regnext((regnode *)node);
22842
22843         /* Where, what. */
22844         if (OP(node) == OPTIMIZED) {
22845             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22846                 optstart = node;
22847             else
22848                 goto after_print;
22849         } else
22850             CLEAR_OPTSTART;
22851
22852         regprop(r, sv, node, NULL, NULL);
22853         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22854                       (int)(2*indent + 1), "", SvPVX_const(sv));
22855
22856         if (OP(node) != OPTIMIZED) {
22857             if (next == NULL)           /* Next ptr. */
22858                 Perl_re_printf( aTHX_  " (0)");
22859             else if (PL_regkind[(U8)op] == BRANCH
22860                      && PL_regkind[OP(next)] != BRANCH )
22861                 Perl_re_printf( aTHX_  " (FAIL)");
22862             else
22863                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22864             Perl_re_printf( aTHX_ "\n");
22865         }
22866
22867       after_print:
22868         if (PL_regkind[(U8)op] == BRANCHJ) {
22869             assert(next);
22870             {
22871                 const regnode *nnode = (OP(next) == LONGJMP
22872                                        ? regnext((regnode *)next)
22873                                        : next);
22874                 if (last && nnode > last)
22875                     nnode = last;
22876                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22877             }
22878         }
22879         else if (PL_regkind[(U8)op] == BRANCH) {
22880             assert(next);
22881             DUMPUNTIL(NEXTOPER(node), next);
22882         }
22883         else if ( PL_regkind[(U8)op]  == TRIE ) {
22884             const regnode *this_trie = node;
22885             const char op = OP(node);
22886             const U32 n = ARG(node);
22887             const reg_ac_data * const ac = op>=AHOCORASICK ?
22888                (reg_ac_data *)ri->data->data[n] :
22889                NULL;
22890             const reg_trie_data * const trie =
22891                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22892 #ifdef DEBUGGING
22893             AV *const trie_words
22894                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22895 #endif
22896             const regnode *nextbranch= NULL;
22897             I32 word_idx;
22898             SvPVCLEAR(sv);
22899             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22900                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22901
22902                 Perl_re_indentf( aTHX_  "%s ",
22903                     indent+3,
22904                     elem_ptr
22905                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22906                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22907                                 PL_colors[0], PL_colors[1],
22908                                 (SvUTF8(*elem_ptr)
22909                                  ? PERL_PV_ESCAPE_UNI
22910                                  : 0)
22911                                 | PERL_PV_PRETTY_ELLIPSES
22912                                 | PERL_PV_PRETTY_LTGT
22913                             )
22914                     : "???"
22915                 );
22916                 if (trie->jump) {
22917                     U16 dist= trie->jump[word_idx+1];
22918                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22919                                (UV)((dist ? this_trie + dist : next) - start));
22920                     if (dist) {
22921                         if (!nextbranch)
22922                             nextbranch= this_trie + trie->jump[0];
22923                         DUMPUNTIL(this_trie + dist, nextbranch);
22924                     }
22925                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22926                         nextbranch= regnext((regnode *)nextbranch);
22927                 } else {
22928                     Perl_re_printf( aTHX_  "\n");
22929                 }
22930             }
22931             if (last && next > last)
22932                 node= last;
22933             else
22934                 node= next;
22935         }
22936         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22937             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22938                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22939         }
22940         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22941             assert(next);
22942             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22943         }
22944         else if ( op == PLUS || op == STAR) {
22945             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22946         }
22947         else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22948             /* Literal string, where present. */
22949             node += NODE_SZ_STR(node) - 1;
22950             node = NEXTOPER(node);
22951         }
22952         else {
22953             node = NEXTOPER(node);
22954             node += regarglen[(U8)op];
22955         }
22956         if (op == CURLYX || op == OPEN || op == SROPEN)
22957             indent++;
22958     }
22959     CLEAR_OPTSTART;
22960 #ifdef DEBUG_DUMPUNTIL
22961     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22962 #endif
22963     return node;
22964 }
22965
22966 #endif  /* DEBUGGING */
22967
22968 #ifndef PERL_IN_XSUB_RE
22969
22970 #  include "uni_keywords.h"
22971
22972 void
22973 Perl_init_uniprops(pTHX)
22974 {
22975
22976 #  ifdef DEBUGGING
22977     char * dump_len_string;
22978
22979     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22980     if (   ! dump_len_string
22981         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22982     {
22983         PL_dump_re_max_len = 60;    /* A reasonable default */
22984     }
22985 #  endif
22986
22987     PL_user_def_props = newHV();
22988
22989 #  ifdef USE_ITHREADS
22990
22991     HvSHAREKEYS_off(PL_user_def_props);
22992     PL_user_def_props_aTHX = aTHX;
22993
22994 #  endif
22995
22996     /* Set up the inversion list interpreter-level variables */
22997
22998     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22999     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23000     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23001     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23002     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23003     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23004     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23005     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23006     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23007     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23008     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23009     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23010     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23011     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23012     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23013     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23014
23015     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23016     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23017     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23018     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23019     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23020     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23021     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23022     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23023     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23024     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23025     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23026     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23027     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23028     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23029     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23030     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23031
23032     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23033     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23034     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23035     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23036     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23037
23038     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23039     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23040     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23041     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23042
23043     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23044
23045     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23046     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23047
23048     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23049     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23050
23051     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23052     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23053                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23054     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23055                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23056     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23057     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23058     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23059     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23060     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23061     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23062     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23063     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23064     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23065
23066 #  ifdef UNI_XIDC
23067     /* The below are used only by deprecated functions.  They could be removed */
23068     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23069     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23070     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23071 #  endif
23072 }
23073
23074 /* These four functions are compiled only in regcomp.c, where they have access
23075  * to the data they return.  They are a way for re_comp.c to get access to that
23076  * data without having to compile the whole data structures. */
23077
23078 I16
23079 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23080 {
23081     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23082
23083     return match_uniprop((U8 *) key, key_len);
23084 }
23085
23086 SV *
23087 Perl_get_prop_definition(pTHX_ const int table_index)
23088 {
23089     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23090
23091     /* Create and return the inversion list */
23092     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23093 }
23094
23095 const char * const *
23096 Perl_get_prop_values(const int table_index)
23097 {
23098     PERL_ARGS_ASSERT_GET_PROP_VALUES;
23099
23100     return UNI_prop_value_ptrs[table_index];
23101 }
23102
23103 const char *
23104 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23105 {
23106     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23107
23108     return deprecated_property_msgs[warning_offset];
23109 }
23110
23111 #  if 0
23112
23113 This code was mainly added for backcompat to give a warning for non-portable
23114 code points in user-defined properties.  But experiments showed that the
23115 warning in earlier perls were only omitted on overflow, which should be an
23116 error, so there really isnt a backcompat issue, and actually adding the
23117 warning when none was present before might cause breakage, for little gain.  So
23118 khw left this code in, but not enabled.  Tests were never added.
23119
23120 embed.fnc entry:
23121 Ei      |const char *|get_extended_utf8_msg|const UV cp
23122
23123 PERL_STATIC_INLINE const char *
23124 S_get_extended_utf8_msg(pTHX_ const UV cp)
23125 {
23126     U8 dummy[UTF8_MAXBYTES + 1];
23127     HV *msgs;
23128     SV **msg;
23129
23130     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23131                              &msgs);
23132
23133     msg = hv_fetchs(msgs, "text", 0);
23134     assert(msg);
23135
23136     (void) sv_2mortal((SV *) msgs);
23137
23138     return SvPVX(*msg);
23139 }
23140
23141 #  endif
23142 #endif /* end of ! PERL_IN_XSUB_RE */
23143
23144 STATIC REGEXP *
23145 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23146                          const bool ignore_case)
23147 {
23148     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23149      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
23150      * because nothing outside of ASCII will match.  Use /m because the input
23151      * string may be a bunch of lines strung together.
23152      *
23153      * Also sets up the debugging info */
23154
23155     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23156     U32 rx_flags;
23157     SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23158     REGEXP * subpattern_re;
23159     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23160
23161     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23162
23163     if (ignore_case) {
23164         flags |= PMf_FOLD;
23165     }
23166     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23167
23168     /* Like in op.c, we copy the compile time pm flags to the rx ones */
23169     rx_flags = flags & RXf_PMf_COMPILETIME;
23170
23171 #ifndef PERL_IN_XSUB_RE
23172     /* Use the core engine if this file is regcomp.c.  That means no
23173      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23174     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23175                                              &PL_core_reg_engine,
23176                                              NULL, NULL,
23177                                              rx_flags, flags);
23178 #else
23179     if (isDEBUG_WILDCARD) {
23180         /* Use the special debugging engine if this file is re_comp.c and wants
23181          * to output the wildcard matching.  This uses whatever
23182          * 'use re "Debug ..." is in effect */
23183         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23184                                                  &my_reg_engine,
23185                                                  NULL, NULL,
23186                                                  rx_flags, flags);
23187     }
23188     else {
23189         /* Use the special wildcard engine if this file is re_comp.c and
23190          * doesn't want to output the wildcard matching.  This uses whatever
23191          * 'use re "Debug ..." is in effect for compilation, but this engine
23192          * structure has been set up so that it uses the core engine for
23193          * execution, so no execution debugging as a result of re.pm will be
23194          * displayed. */
23195         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23196                                                  &wild_reg_engine,
23197                                                  NULL, NULL,
23198                                                  rx_flags, flags);
23199         /* XXX The above has the effect that any user-supplied regex engine
23200          * won't be called for matching wildcards.  That might be good, or bad.
23201          * It could be changed in several ways.  The reason it is done the
23202          * current way is to avoid having to save and restore
23203          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
23204          * could be used.  Another suggestion is to keep the authoritative
23205          * value of the debug flags in a thread-local variable and add set/get
23206          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23207          * Still another is to pass a flag, say in the engine's intflags that
23208          * would be checked each time before doing the debug output */
23209     }
23210 #endif
23211
23212     assert(subpattern_re);  /* Should have died if didn't compile successfully */
23213     return subpattern_re;
23214 }
23215
23216 STATIC I32
23217 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23218          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23219 {
23220     I32 result;
23221     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23222
23223     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23224
23225     ENTER;
23226
23227     /* The compilation has set things up so that if the program doesn't want to
23228      * see the wildcard matching procedure, it will get the core execution
23229      * engine, which is subject only to -Dr.  So we have to turn that off
23230      * around this procedure */
23231     if (! isDEBUG_WILDCARD) {
23232         /* Note! Casts away 'volatile' */
23233         SAVEI32(PL_debug);
23234         PL_debug &= ~ DEBUG_r_FLAG;
23235     }
23236
23237     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23238                          NULL, nosave);
23239     LEAVE;
23240
23241     return result;
23242 }
23243
23244 SV *
23245 S_handle_user_defined_property(pTHX_
23246
23247     /* Parses the contents of a user-defined property definition; returning the
23248      * expanded definition if possible.  If so, the return is an inversion
23249      * list.
23250      *
23251      * If there are subroutines that are part of the expansion and which aren't
23252      * known at the time of the call to this function, this returns what
23253      * parse_uniprop_string() returned for the first one encountered.
23254      *
23255      * If an error was found, NULL is returned, and 'msg' gets a suitable
23256      * message appended to it.  (Appending allows the back trace of how we got
23257      * to the faulty definition to be displayed through nested calls of
23258      * user-defined subs.)
23259      *
23260      * The caller IS responsible for freeing any returned SV.
23261      *
23262      * The syntax of the contents is pretty much described in perlunicode.pod,
23263      * but we also allow comments on each line */
23264
23265     const char * name,          /* Name of property */
23266     const STRLEN name_len,      /* The name's length in bytes */
23267     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23268     const bool to_fold,         /* ? Is this under /i */
23269     const bool runtime,         /* ? Are we in compile- or run-time */
23270     const bool deferrable,      /* Is it ok for this property's full definition
23271                                    to be deferred until later? */
23272     SV* contents,               /* The property's definition */
23273     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
23274                                    getting called unless this is thought to be
23275                                    a user-defined property */
23276     SV * msg,                   /* Any error or warning msg(s) are appended to
23277                                    this */
23278     const STRLEN level)         /* Recursion level of this call */
23279 {
23280     STRLEN len;
23281     const char * string         = SvPV_const(contents, len);
23282     const char * const e        = string + len;
23283     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23284     const STRLEN msgs_length_on_entry = SvCUR(msg);
23285
23286     const char * s0 = string;   /* Points to first byte in the current line
23287                                    being parsed in 'string' */
23288     const char overflow_msg[] = "Code point too large in \"";
23289     SV* running_definition = NULL;
23290
23291     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23292
23293     *user_defined_ptr = TRUE;
23294
23295     /* Look at each line */
23296     while (s0 < e) {
23297         const char * s;     /* Current byte */
23298         char op = '+';      /* Default operation is 'union' */
23299         IV   min = 0;       /* range begin code point */
23300         IV   max = -1;      /* and range end */
23301         SV* this_definition;
23302
23303         /* Skip comment lines */
23304         if (*s0 == '#') {
23305             s0 = strchr(s0, '\n');
23306             if (s0 == NULL) {
23307                 break;
23308             }
23309             s0++;
23310             continue;
23311         }
23312
23313         /* For backcompat, allow an empty first line */
23314         if (*s0 == '\n') {
23315             s0++;
23316             continue;
23317         }
23318
23319         /* First character in the line may optionally be the operation */
23320         if (   *s0 == '+'
23321             || *s0 == '!'
23322             || *s0 == '-'
23323             || *s0 == '&')
23324         {
23325             op = *s0++;
23326         }
23327
23328         /* If the line is one or two hex digits separated by blank space, its
23329          * a range; otherwise it is either another user-defined property or an
23330          * error */
23331
23332         s = s0;
23333
23334         if (! isXDIGIT(*s)) {
23335             goto check_if_property;
23336         }
23337
23338         do { /* Each new hex digit will add 4 bits. */
23339             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23340                 s = strchr(s, '\n');
23341                 if (s == NULL) {
23342                     s = e;
23343                 }
23344                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23345                 sv_catpv(msg, overflow_msg);
23346                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23347                                      UTF8fARG(is_contents_utf8, s - s0, s0));
23348                 sv_catpvs(msg, "\"");
23349                 goto return_failure;
23350             }
23351
23352             /* Accumulate this digit into the value */
23353             min = (min << 4) + READ_XDIGIT(s);
23354         } while (isXDIGIT(*s));
23355
23356         while (isBLANK(*s)) { s++; }
23357
23358         /* We allow comments at the end of the line */
23359         if (*s == '#') {
23360             s = strchr(s, '\n');
23361             if (s == NULL) {
23362                 s = e;
23363             }
23364             s++;
23365         }
23366         else if (s < e && *s != '\n') {
23367             if (! isXDIGIT(*s)) {
23368                 goto check_if_property;
23369             }
23370
23371             /* Look for the high point of the range */
23372             max = 0;
23373             do {
23374                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23375                     s = strchr(s, '\n');
23376                     if (s == NULL) {
23377                         s = e;
23378                     }
23379                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23380                     sv_catpv(msg, overflow_msg);
23381                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23382                                       UTF8fARG(is_contents_utf8, s - s0, s0));
23383                     sv_catpvs(msg, "\"");
23384                     goto return_failure;
23385                 }
23386
23387                 max = (max << 4) + READ_XDIGIT(s);
23388             } while (isXDIGIT(*s));
23389
23390             while (isBLANK(*s)) { s++; }
23391
23392             if (*s == '#') {
23393                 s = strchr(s, '\n');
23394                 if (s == NULL) {
23395                     s = e;
23396                 }
23397             }
23398             else if (s < e && *s != '\n') {
23399                 goto check_if_property;
23400             }
23401         }
23402
23403         if (max == -1) {    /* The line only had one entry */
23404             max = min;
23405         }
23406         else if (max < min) {
23407             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23408             sv_catpvs(msg, "Illegal range in \"");
23409             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23410                                 UTF8fARG(is_contents_utf8, s - s0, s0));
23411             sv_catpvs(msg, "\"");
23412             goto return_failure;
23413         }
23414
23415 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
23416
23417         if (   UNICODE_IS_PERL_EXTENDED(min)
23418             || UNICODE_IS_PERL_EXTENDED(max))
23419         {
23420             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23421
23422             /* If both code points are non-portable, warn only on the lower
23423              * one. */
23424             sv_catpv(msg, get_extended_utf8_msg(
23425                                             (UNICODE_IS_PERL_EXTENDED(min))
23426                                             ? min : max));
23427             sv_catpvs(msg, " in \"");
23428             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23429                                  UTF8fARG(is_contents_utf8, s - s0, s0));
23430             sv_catpvs(msg, "\"");
23431         }
23432
23433 #  endif
23434
23435         /* Here, this line contains a legal range */
23436         this_definition = sv_2mortal(_new_invlist(2));
23437         this_definition = _add_range_to_invlist(this_definition, min, max);
23438         goto calculate;
23439
23440       check_if_property:
23441
23442         /* Here it isn't a legal range line.  See if it is a legal property
23443          * line.  First find the end of the meat of the line */
23444         s = strpbrk(s, "#\n");
23445         if (s == NULL) {
23446             s = e;
23447         }
23448
23449         /* Ignore trailing blanks in keeping with the requirements of
23450          * parse_uniprop_string() */
23451         s--;
23452         while (s > s0 && isBLANK_A(*s)) {
23453             s--;
23454         }
23455         s++;
23456
23457         this_definition = parse_uniprop_string(s0, s - s0,
23458                                                is_utf8, to_fold, runtime,
23459                                                deferrable,
23460                                                NULL,
23461                                                user_defined_ptr, msg,
23462                                                (name_len == 0)
23463                                                 ? level /* Don't increase level
23464                                                            if input is empty */
23465                                                 : level + 1
23466                                               );
23467         if (this_definition == NULL) {
23468             goto return_failure;    /* 'msg' should have had the reason
23469                                        appended to it by the above call */
23470         }
23471
23472         if (! is_invlist(this_definition)) {    /* Unknown at this time */
23473             return newSVsv(this_definition);
23474         }
23475
23476         if (*s != '\n') {
23477             s = strchr(s, '\n');
23478             if (s == NULL) {
23479                 s = e;
23480             }
23481         }
23482
23483       calculate:
23484
23485         switch (op) {
23486             case '+':
23487                 _invlist_union(running_definition, this_definition,
23488                                                         &running_definition);
23489                 break;
23490             case '-':
23491                 _invlist_subtract(running_definition, this_definition,
23492                                                         &running_definition);
23493                 break;
23494             case '&':
23495                 _invlist_intersection(running_definition, this_definition,
23496                                                         &running_definition);
23497                 break;
23498             case '!':
23499                 _invlist_union_complement_2nd(running_definition,
23500                                         this_definition, &running_definition);
23501                 break;
23502             default:
23503                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23504                                  __FILE__, __LINE__, op);
23505                 break;
23506         }
23507
23508         /* Position past the '\n' */
23509         s0 = s + 1;
23510     }   /* End of loop through the lines of 'contents' */
23511
23512     /* Here, we processed all the lines in 'contents' without error.  If we
23513      * didn't add any warnings, simply return success */
23514     if (msgs_length_on_entry == SvCUR(msg)) {
23515
23516         /* If the expansion was empty, the answer isn't nothing: its an empty
23517          * inversion list */
23518         if (running_definition == NULL) {
23519             running_definition = _new_invlist(1);
23520         }
23521
23522         return running_definition;
23523     }
23524
23525     /* Otherwise, add some explanatory text, but we will return success */
23526     goto return_msg;
23527
23528   return_failure:
23529     running_definition = NULL;
23530
23531   return_msg:
23532
23533     if (name_len > 0) {
23534         sv_catpvs(msg, " in expansion of ");
23535         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23536     }
23537
23538     return running_definition;
23539 }
23540
23541 /* As explained below, certain operations need to take place in the first
23542  * thread created.  These macros switch contexts */
23543 #  ifdef USE_ITHREADS
23544 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
23545                                         PerlInterpreter * save_aTHX = aTHX;
23546 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
23547                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23548 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23549 #    define CUR_CONTEXT      aTHX
23550 #    define ORIGINAL_CONTEXT save_aTHX
23551 #  else
23552 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
23553 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23554 #    define RESTORE_CONTEXT                   NOOP
23555 #    define CUR_CONTEXT                       NULL
23556 #    define ORIGINAL_CONTEXT                  NULL
23557 #  endif
23558
23559 STATIC void
23560 S_delete_recursion_entry(pTHX_ void *key)
23561 {
23562     /* Deletes the entry used to detect recursion when expanding user-defined
23563      * properties.  This is a function so it can be set up to be called even if
23564      * the program unexpectedly quits */
23565
23566     SV ** current_entry;
23567     const STRLEN key_len = strlen((const char *) key);
23568     DECLARATION_FOR_GLOBAL_CONTEXT;
23569
23570     SWITCH_TO_GLOBAL_CONTEXT;
23571
23572     /* If the entry is one of these types, it is a permanent entry, and not the
23573      * one used to detect recursions.  This function should delete only the
23574      * recursion entry */
23575     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23576     if (     current_entry
23577         && ! is_invlist(*current_entry)
23578         && ! SvPOK(*current_entry))
23579     {
23580         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23581                                                                     G_DISCARD);
23582     }
23583
23584     RESTORE_CONTEXT;
23585 }
23586
23587 STATIC SV *
23588 S_get_fq_name(pTHX_
23589               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23590               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23591               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23592               const bool has_colon_colon
23593              )
23594 {
23595     /* Returns a mortal SV containing the fully qualified version of the input
23596      * name */
23597
23598     SV * fq_name;
23599
23600     fq_name = newSVpvs_flags("", SVs_TEMP);
23601
23602     /* Use the current package if it wasn't included in our input */
23603     if (! has_colon_colon) {
23604         const HV * pkg = (IN_PERL_COMPILETIME)
23605                          ? PL_curstash
23606                          : CopSTASH(PL_curcop);
23607         const char* pkgname = HvNAME(pkg);
23608
23609         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23610                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23611         sv_catpvs(fq_name, "::");
23612     }
23613
23614     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23615                          UTF8fARG(is_utf8, name_len, name));
23616     return fq_name;
23617 }
23618
23619 STATIC SV *
23620 S_parse_uniprop_string(pTHX_
23621
23622     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23623      * now.  If so, the return is an inversion list.
23624      *
23625      * If the property is user-defined, it is a subroutine, which in turn
23626      * may call other subroutines.  This function will call the whole nest of
23627      * them to get the definition they return; if some aren't known at the time
23628      * of the call to this function, the fully qualified name of the highest
23629      * level sub is returned.  It is an error to call this function at runtime
23630      * without every sub defined.
23631      *
23632      * If an error was found, NULL is returned, and 'msg' gets a suitable
23633      * message appended to it.  (Appending allows the back trace of how we got
23634      * to the faulty definition to be displayed through nested calls of
23635      * user-defined subs.)
23636      *
23637      * The caller should NOT try to free any returned inversion list.
23638      *
23639      * Other parameters will be set on return as described below */
23640
23641     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23642     Size_t name_len,            /* Its length in bytes, not including any
23643                                    trailing space */
23644     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23645     const bool to_fold,         /* ? Is this under /i */
23646     const bool runtime,         /* TRUE if this is being called at run time */
23647     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23648                                    known at this call */
23649     AV ** strings,              /* To return string property values, like named
23650                                    sequences */
23651     bool *user_defined_ptr,     /* Upon return from this function it will be
23652                                    set to TRUE if any component is a
23653                                    user-defined property */
23654     SV * msg,                   /* Any error or warning msg(s) are appended to
23655                                    this */
23656     const STRLEN level)         /* Recursion level of this call */
23657 {
23658     char* lookup_name;          /* normalized name for lookup in our tables */
23659     unsigned lookup_len;        /* Its length */
23660     enum { Not_Strict = 0,      /* Some properties have stricter name */
23661            Strict,              /* normalization rules, which we decide */
23662            As_Is                /* upon based on parsing */
23663          } stricter = Not_Strict;
23664
23665     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23666      * (though it requires extra effort to download them from Unicode and
23667      * compile perl to know about them) */
23668     bool is_nv_type = FALSE;
23669
23670     unsigned int i, j = 0;
23671     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23672     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23673     int table_index = 0;    /* The entry number for this property in the table
23674                                of all Unicode property names */
23675     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23676     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23677                                    the normalized name in certain situations */
23678     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23679                                    part of a package name */
23680     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
23681     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23682                                              property rather than a Unicode
23683                                              one. */
23684     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23685                                      if an error.  If it is an inversion list,
23686                                      it is the definition.  Otherwise it is a
23687                                      string containing the fully qualified sub
23688                                      name of 'name' */
23689     SV * fq_name = NULL;        /* For user-defined properties, the fully
23690                                    qualified name */
23691     bool invert_return = FALSE; /* ? Do we need to complement the result before
23692                                      returning it */
23693     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23694                                        explicit utf8:: package that we strip
23695                                        off  */
23696     /* The expansion of properties that could be either user-defined or
23697      * official unicode ones is deferred until runtime, including a marker for
23698      * those that might be in the latter category.  This boolean indicates if
23699      * we've seen that marker.  If not, what we're parsing can't be such an
23700      * official Unicode property whose expansion was deferred */
23701     bool could_be_deferred_official = FALSE;
23702
23703     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23704
23705     /* The input will be normalized into 'lookup_name' */
23706     Newx(lookup_name, name_len, char);
23707     SAVEFREEPV(lookup_name);
23708
23709     /* Parse the input. */
23710     for (i = 0; i < name_len; i++) {
23711         char cur = name[i];
23712
23713         /* Most of the characters in the input will be of this ilk, being parts
23714          * of a name */
23715         if (isIDCONT_A(cur)) {
23716
23717             /* Case differences are ignored.  Our lookup routine assumes
23718              * everything is lowercase, so normalize to that */
23719             if (isUPPER_A(cur)) {
23720                 lookup_name[j++] = toLOWER_A(cur);
23721                 continue;
23722             }
23723
23724             if (cur == '_') { /* Don't include these in the normalized name */
23725                 continue;
23726             }
23727
23728             lookup_name[j++] = cur;
23729
23730             /* The first character in a user-defined name must be of this type.
23731              * */
23732             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23733                 could_be_user_defined = FALSE;
23734             }
23735
23736             continue;
23737         }
23738
23739         /* Here, the character is not something typically in a name,  But these
23740          * two types of characters (and the '_' above) can be freely ignored in
23741          * most situations.  Later it may turn out we shouldn't have ignored
23742          * them, and we have to reparse, but we don't have enough information
23743          * yet to make that decision */
23744         if (cur == '-' || isSPACE_A(cur)) {
23745             could_be_user_defined = FALSE;
23746             continue;
23747         }
23748
23749         /* An equals sign or single colon mark the end of the first part of
23750          * the property name */
23751         if (    cur == '='
23752             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23753         {
23754             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23755             equals_pos = j; /* Note where it occurred in the input */
23756             could_be_user_defined = FALSE;
23757             break;
23758         }
23759
23760         /* If this looks like it is a marker we inserted at compile time,
23761          * set a flag and otherwise ignore it.  If it isn't in the final
23762          * position, keep it as it would have been user input. */
23763         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23764             && ! deferrable
23765             &&   could_be_user_defined
23766             &&   i == name_len - 1)
23767         {
23768             name_len--;
23769             could_be_deferred_official = TRUE;
23770             continue;
23771         }
23772
23773         /* Otherwise, this character is part of the name. */
23774         lookup_name[j++] = cur;
23775
23776         /* Here it isn't a single colon, so if it is a colon, it must be a
23777          * double colon */
23778         if (cur == ':') {
23779
23780             /* A double colon should be a package qualifier.  We note its
23781              * position and continue.  Note that one could have
23782              *      pkg1::pkg2::...::foo
23783              * so that the position at the end of the loop will be just after
23784              * the final qualifier */
23785
23786             i++;
23787             non_pkg_begin = i + 1;
23788             lookup_name[j++] = ':';
23789             lun_non_pkg_begin = j;
23790         }
23791         else { /* Only word chars (and '::') can be in a user-defined name */
23792             could_be_user_defined = FALSE;
23793         }
23794     } /* End of parsing through the lhs of the property name (or all of it if
23795          no rhs) */
23796
23797 #  define STRLENs(s)  (sizeof("" s "") - 1)
23798
23799     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23800      * be for a user-defined property, or it could be a Unicode property, as
23801      * all of them are considered to be for that package.  For the purposes of
23802      * parsing the rest of the property, strip it off */
23803     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23804         lookup_name +=  STRLENs("utf8::");
23805         j -=  STRLENs("utf8::");
23806         equals_pos -=  STRLENs("utf8::");
23807         stripped_utf8_pkg = TRUE;
23808     }
23809
23810     /* Here, we are either done with the whole property name, if it was simple;
23811      * or are positioned just after the '=' if it is compound. */
23812
23813     if (equals_pos >= 0) {
23814         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23815
23816         /* Space immediately after the '=' is ignored */
23817         i++;
23818         for (; i < name_len; i++) {
23819             if (! isSPACE_A(name[i])) {
23820                 break;
23821             }
23822         }
23823
23824         /* Most punctuation after the equals indicates a subpattern, like
23825          * \p{foo=/bar/} */
23826         if (   isPUNCT_A(name[i])
23827             &&  name[i] != '-'
23828             &&  name[i] != '+'
23829             &&  name[i] != '_'
23830             &&  name[i] != '{'
23831                 /* A backslash means the real delimitter is the next character,
23832                  * but it must be punctuation */
23833             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23834         {
23835             bool special_property = memEQs(lookup_name, j - 1, "name")
23836                                  || memEQs(lookup_name, j - 1, "na");
23837             if (! special_property) {
23838                 /* Find the property.  The table includes the equals sign, so
23839                  * we use 'j' as-is */
23840                 table_index = do_uniprop_match(lookup_name, j);
23841             }
23842             if (special_property || table_index) {
23843                 REGEXP * subpattern_re;
23844                 char open = name[i++];
23845                 char close;
23846                 const char * pos_in_brackets;
23847                 const char * const * prop_values;
23848                 bool escaped = 0;
23849
23850                 /* Backslash => delimitter is the character following.  We
23851                  * already checked that it is punctuation */
23852                 if (open == '\\') {
23853                     open = name[i++];
23854                     escaped = 1;
23855                 }
23856
23857                 /* This data structure is constructed so that the matching
23858                  * closing bracket is 3 past its matching opening.  The second
23859                  * set of closing is so that if the opening is something like
23860                  * ']', the closing will be that as well.  Something similar is
23861                  * done in toke.c */
23862                 pos_in_brackets = memCHRs("([<)]>)]>", open);
23863                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23864
23865                 if (    i >= name_len
23866                     ||  name[name_len-1] != close
23867                     || (escaped && name[name_len-2] != '\\')
23868                         /* Also make sure that there are enough characters.
23869                          * e.g., '\\\' would show up incorrectly as legal even
23870                          * though it is too short */
23871                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
23872                 {
23873                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23874                     goto append_name_to_msg;
23875                 }
23876
23877                 Perl_ck_warner_d(aTHX_
23878                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23879                     "The Unicode property wildcards feature is experimental");
23880
23881                 if (special_property) {
23882                     const char * error_msg;
23883                     const char * revised_name = name + i;
23884                     Size_t revised_name_len = name_len - (i + 1 + escaped);
23885
23886                     /* Currently, the only 'special_property' is name, which we
23887                      * lookup in _charnames.pm */
23888
23889                     if (! load_charnames(newSVpvs("placeholder"),
23890                                          revised_name, revised_name_len,
23891                                          &error_msg))
23892                     {
23893                         sv_catpv(msg, error_msg);
23894                         goto append_name_to_msg;
23895                     }
23896
23897                     /* Farm this out to a function just to make the current
23898                      * function less unwieldy */
23899                     if (handle_names_wildcard(revised_name, revised_name_len,
23900                                               &prop_definition,
23901                                               strings))
23902                     {
23903                         return prop_definition;
23904                     }
23905
23906                     goto failed;
23907                 }
23908
23909                 prop_values = get_prop_values(table_index);
23910
23911                 /* Now create and compile the wildcard subpattern.  Use /i
23912                  * because the property values are supposed to match with case
23913                  * ignored. */
23914                 subpattern_re = compile_wildcard(name + i,
23915                                                  name_len - i - 1 - escaped,
23916                                                  TRUE /* /i */
23917                                                 );
23918
23919                 /* For each legal property value, see if the supplied pattern
23920                  * matches it. */
23921                 while (*prop_values) {
23922                     const char * const entry = *prop_values;
23923                     const Size_t len = strlen(entry);
23924                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23925
23926                     if (execute_wildcard(subpattern_re,
23927                                  (char *) entry,
23928                                  (char *) entry + len,
23929                                  (char *) entry, 0,
23930                                  entry_sv,
23931                                  0))
23932                     { /* Here, matched.  Add to the returned list */
23933                         Size_t total_len = j + len;
23934                         SV * sub_invlist = NULL;
23935                         char * this_string;
23936
23937                         /* We know this is a legal \p{property=value}.  Call
23938                          * the function to return the list of code points that
23939                          * match it */
23940                         Newxz(this_string, total_len + 1, char);
23941                         Copy(lookup_name, this_string, j, char);
23942                         my_strlcat(this_string, entry, total_len + 1);
23943                         SAVEFREEPV(this_string);
23944                         sub_invlist = parse_uniprop_string(this_string,
23945                                                            total_len,
23946                                                            is_utf8,
23947                                                            to_fold,
23948                                                            runtime,
23949                                                            deferrable,
23950                                                            NULL,
23951                                                            user_defined_ptr,
23952                                                            msg,
23953                                                            level + 1);
23954                         _invlist_union(prop_definition, sub_invlist,
23955                                        &prop_definition);
23956                     }
23957
23958                     prop_values++;  /* Next iteration, look at next propvalue */
23959                 } /* End of looking through property values; (the data
23960                      structure is terminated by a NULL ptr) */
23961
23962                 SvREFCNT_dec_NN(subpattern_re);
23963
23964                 if (prop_definition) {
23965                     return prop_definition;
23966                 }
23967
23968                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23969                 goto append_name_to_msg;
23970             }
23971
23972             /* Here's how khw thinks we should proceed to handle the properties
23973              * not yet done:    Bidi Mirroring Glyph        can map to ""
23974                                 Bidi Paired Bracket         can map to ""
23975                                 Case Folding  (both full and simple)
23976                                             Shouldn't /i be good enough for Full
23977                                 Decomposition Mapping
23978                                 Equivalent Unified Ideograph    can map to ""
23979                                 Lowercase Mapping  (both full and simple)
23980                                 NFKC Case Fold                  can map to ""
23981                                 Titlecase Mapping  (both full and simple)
23982                                 Uppercase Mapping  (both full and simple)
23983              * Handle these the same way Name is done, using say, _wild.pm, but
23984              * having both loose and full, like in charclass_invlists.h.
23985              * Perhaps move block and script to that as they are somewhat large
23986              * in charclass_invlists.h.
23987              * For properties where the default is the code point itself, such
23988              * as any of the case changing mappings, the string would otherwise
23989              * consist of all Unicode code points in UTF-8 strung together.
23990              * This would be impractical.  So instead, examine their compiled
23991              * pattern, looking at the ssc.  If none, reject the pattern as an
23992              * error.  Otherwise run the pattern against every code point in
23993              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
23994              * And it might be good to create an API to return the ssc.
23995              * Or handle them like the algorithmic names are done
23996              */
23997         } /* End of is a wildcard subppattern */
23998
23999         /* \p{name=...} is handled specially.  Instead of using the normal
24000          * mechanism involving charclass_invlists.h, it uses _charnames.pm
24001          * which has the necessary (huge) data accessible to it, and which
24002          * doesn't get loaded unless necessary.  The legal syntax for names is
24003          * somewhat different than other properties due both to the vagaries of
24004          * a few outlier official names, and the fact that only a few ASCII
24005          * characters are permitted in them */
24006         if (   memEQs(lookup_name, j - 1, "name")
24007             || memEQs(lookup_name, j - 1, "na"))
24008         {
24009             dSP;
24010             HV * table;
24011             SV * character;
24012             const char * error_msg;
24013             CV* lookup_loose;
24014             SV * character_name;
24015             STRLEN character_len;
24016             UV cp;
24017
24018             stricter = As_Is;
24019
24020             /* Since the RHS (after skipping initial space) is passed unchanged
24021              * to charnames, and there are different criteria for what are
24022              * legal characters in the name, just parse it here.  A character
24023              * name must begin with an ASCII alphabetic */
24024             if (! isALPHA(name[i])) {
24025                 goto failed;
24026             }
24027             lookup_name[j++] = name[i];
24028
24029             for (++i; i < name_len; i++) {
24030                 /* Official names can only be in the ASCII range, and only
24031                  * certain characters */
24032                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24033                     goto failed;
24034                 }
24035                 lookup_name[j++] = name[i];
24036             }
24037
24038             /* Finished parsing, save the name into an SV */
24039             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24040
24041             /* Make sure _charnames is loaded.  (The parameters give context
24042              * for any errors generated */
24043             table = load_charnames(character_name, name, name_len, &error_msg);
24044             if (table == NULL) {
24045                 sv_catpv(msg, error_msg);
24046                 goto append_name_to_msg;
24047             }
24048
24049             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24050             if (! lookup_loose) {
24051                 Perl_croak(aTHX_
24052                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
24053             }
24054
24055             PUSHSTACKi(PERLSI_REGCOMP);
24056             ENTER ;
24057             SAVETMPS;
24058             save_re_context();
24059
24060             PUSHMARK(SP) ;
24061             XPUSHs(character_name);
24062             PUTBACK;
24063             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24064
24065             SPAGAIN ;
24066
24067             character = POPs;
24068             SvREFCNT_inc_simple_void_NN(character);
24069
24070             PUTBACK ;
24071             FREETMPS ;
24072             LEAVE ;
24073             POPSTACK;
24074
24075             if (! SvOK(character)) {
24076                 goto failed;
24077             }
24078
24079             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24080             if (character_len == SvCUR(character)) {
24081                 prop_definition = add_cp_to_invlist(NULL, cp);
24082             }
24083             else {
24084                 AV * this_string;
24085
24086                 /* First of the remaining characters in the string. */
24087                 char * remaining = SvPVX(character) + character_len;
24088
24089                 if (strings == NULL) {
24090                     goto failed;    /* XXX Perhaps a specific msg instead, like
24091                                        'not available here' */
24092                 }
24093
24094                 if (*strings == NULL) {
24095                     *strings = newAV();
24096                 }
24097
24098                 this_string = newAV();
24099                 av_push(this_string, newSVuv(cp));
24100
24101                 do {
24102                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24103                     av_push(this_string, newSVuv(cp));
24104                     remaining += character_len;
24105                 } while (remaining < SvEND(character));
24106
24107                 av_push(*strings, (SV *) this_string);
24108             }
24109
24110             return prop_definition;
24111         }
24112
24113         /* Certain properties whose values are numeric need special handling.
24114          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
24115          * purposes of checking if this is one of those properties */
24116         if (memBEGINPs(lookup_name, j, "is")) {
24117             lookup_offset = 2;
24118         }
24119
24120         /* Then check if it is one of these specially-handled properties.  The
24121          * possibilities are hard-coded because easier this way, and the list
24122          * is unlikely to change.
24123          *
24124          * All numeric value type properties are of this ilk, and are also
24125          * special in a different way later on.  So find those first.  There
24126          * are several numeric value type properties in the Unihan DB (which is
24127          * unlikely to be compiled with perl, but we handle it here in case it
24128          * does get compiled).  They all end with 'numeric'.  The interiors
24129          * aren't checked for the precise property.  This would stop working if
24130          * a cjk property were to be created that ended with 'numeric' and
24131          * wasn't a numeric type */
24132         is_nv_type = memEQs(lookup_name + lookup_offset,
24133                        j - 1 - lookup_offset, "numericvalue")
24134                   || memEQs(lookup_name + lookup_offset,
24135                       j - 1 - lookup_offset, "nv")
24136                   || (   memENDPs(lookup_name + lookup_offset,
24137                             j - 1 - lookup_offset, "numeric")
24138                       && (   memBEGINPs(lookup_name + lookup_offset,
24139                                       j - 1 - lookup_offset, "cjk")
24140                           || memBEGINPs(lookup_name + lookup_offset,
24141                                       j - 1 - lookup_offset, "k")));
24142         if (   is_nv_type
24143             || memEQs(lookup_name + lookup_offset,
24144                       j - 1 - lookup_offset, "canonicalcombiningclass")
24145             || memEQs(lookup_name + lookup_offset,
24146                       j - 1 - lookup_offset, "ccc")
24147             || memEQs(lookup_name + lookup_offset,
24148                       j - 1 - lookup_offset, "age")
24149             || memEQs(lookup_name + lookup_offset,
24150                       j - 1 - lookup_offset, "in")
24151             || memEQs(lookup_name + lookup_offset,
24152                       j - 1 - lookup_offset, "presentin"))
24153         {
24154             unsigned int k;
24155
24156             /* Since the stuff after the '=' is a number, we can't throw away
24157              * '-' willy-nilly, as those could be a minus sign.  Other stricter
24158              * rules also apply.  However, these properties all can have the
24159              * rhs not be a number, in which case they contain at least one
24160              * alphabetic.  In those cases, the stricter rules don't apply.
24161              * But the numeric type properties can have the alphas [Ee] to
24162              * signify an exponent, and it is still a number with stricter
24163              * rules.  So look for an alpha that signifies not-strict */
24164             stricter = Strict;
24165             for (k = i; k < name_len; k++) {
24166                 if (   isALPHA_A(name[k])
24167                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24168                 {
24169                     stricter = Not_Strict;
24170                     break;
24171                 }
24172             }
24173         }
24174
24175         if (stricter) {
24176
24177             /* A number may have a leading '+' or '-'.  The latter is retained
24178              * */
24179             if (name[i] == '+') {
24180                 i++;
24181             }
24182             else if (name[i] == '-') {
24183                 lookup_name[j++] = '-';
24184                 i++;
24185             }
24186
24187             /* Skip leading zeros including single underscores separating the
24188              * zeros, or between the final leading zero and the first other
24189              * digit */
24190             for (; i < name_len - 1; i++) {
24191                 if (    name[i] != '0'
24192                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24193                 {
24194                     break;
24195                 }
24196             }
24197         }
24198     }
24199     else {  /* No '=' */
24200
24201        /* Only a few properties without an '=' should be parsed with stricter
24202         * rules.  The list is unlikely to change. */
24203         if (   memBEGINPs(lookup_name, j, "perl")
24204             && memNEs(lookup_name + 4, j - 4, "space")
24205             && memNEs(lookup_name + 4, j - 4, "word"))
24206         {
24207             stricter = Strict;
24208
24209             /* We set the inputs back to 0 and the code below will reparse,
24210              * using strict */
24211             i = j = 0;
24212         }
24213     }
24214
24215     /* Here, we have either finished the property, or are positioned to parse
24216      * the remainder, and we know if stricter rules apply.  Finish out, if not
24217      * already done */
24218     for (; i < name_len; i++) {
24219         char cur = name[i];
24220
24221         /* In all instances, case differences are ignored, and we normalize to
24222          * lowercase */
24223         if (isUPPER_A(cur)) {
24224             lookup_name[j++] = toLOWER(cur);
24225             continue;
24226         }
24227
24228         /* An underscore is skipped, but not under strict rules unless it
24229          * separates two digits */
24230         if (cur == '_') {
24231             if (    stricter
24232                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
24233                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24234             {
24235                 lookup_name[j++] = '_';
24236             }
24237             continue;
24238         }
24239
24240         /* Hyphens are skipped except under strict */
24241         if (cur == '-' && ! stricter) {
24242             continue;
24243         }
24244
24245         /* XXX Bug in documentation.  It says white space skipped adjacent to
24246          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
24247          * in a number */
24248         if (isSPACE_A(cur) && ! stricter) {
24249             continue;
24250         }
24251
24252         lookup_name[j++] = cur;
24253
24254         /* Unless this is a non-trailing slash, we are done with it */
24255         if (i >= name_len - 1 || cur != '/') {
24256             continue;
24257         }
24258
24259         slash_pos = j;
24260
24261         /* A slash in the 'numeric value' property indicates that what follows
24262          * is a denominator.  It can have a leading '+' and '0's that should be
24263          * skipped.  But we have never allowed a negative denominator, so treat
24264          * a minus like every other character.  (No need to rule out a second
24265          * '/', as that won't match anything anyway */
24266         if (is_nv_type) {
24267             i++;
24268             if (i < name_len && name[i] == '+') {
24269                 i++;
24270             }
24271
24272             /* Skip leading zeros including underscores separating digits */
24273             for (; i < name_len - 1; i++) {
24274                 if (   name[i] != '0'
24275                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24276                 {
24277                     break;
24278                 }
24279             }
24280
24281             /* Store the first real character in the denominator */
24282             if (i < name_len) {
24283                 lookup_name[j++] = name[i];
24284             }
24285         }
24286     }
24287
24288     /* Here are completely done parsing the input 'name', and 'lookup_name'
24289      * contains a copy, normalized.
24290      *
24291      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24292      * different from without the underscores.  */
24293     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
24294            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24295         && UNLIKELY(name[name_len-1] == '_'))
24296     {
24297         lookup_name[j++] = '&';
24298     }
24299
24300     /* If the original input began with 'In' or 'Is', it could be a subroutine
24301      * call to a user-defined property instead of a Unicode property name. */
24302     if (    name_len - non_pkg_begin > 2
24303         &&  name[non_pkg_begin+0] == 'I'
24304         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24305     {
24306         /* Names that start with In have different characterstics than those
24307          * that start with Is */
24308         if (name[non_pkg_begin+1] == 's') {
24309             starts_with_Is = TRUE;
24310         }
24311     }
24312     else {
24313         could_be_user_defined = FALSE;
24314     }
24315
24316     if (could_be_user_defined) {
24317         CV* user_sub;
24318
24319         /* If the user defined property returns the empty string, it could
24320          * easily be because the pattern is being compiled before the data it
24321          * actually needs to compile is available.  This could be argued to be
24322          * a bug in the perl code, but this is a change of behavior for Perl,
24323          * so we handle it.  This means that intentionally returning nothing
24324          * will not be resolved until runtime */
24325         bool empty_return = FALSE;
24326
24327         /* Here, the name could be for a user defined property, which are
24328          * implemented as subs. */
24329         user_sub = get_cvn_flags(name, name_len, 0);
24330         if (! user_sub) {
24331
24332             /* Here, the property name could be a user-defined one, but there
24333              * is no subroutine to handle it (as of now).   Defer handling it
24334              * until runtime.  Otherwise, a block defined by Unicode in a later
24335              * release would get the synonym InFoo added for it, and existing
24336              * code that used that name would suddenly break if it referred to
24337              * the property before the sub was declared.  See [perl #134146] */
24338             if (deferrable) {
24339                 goto definition_deferred;
24340             }
24341
24342             /* Here, we are at runtime, and didn't find the user property.  It
24343              * could be an official property, but only if no package was
24344              * specified, or just the utf8:: package. */
24345             if (could_be_deferred_official) {
24346                 lookup_name += lun_non_pkg_begin;
24347                 j -= lun_non_pkg_begin;
24348             }
24349             else if (! stripped_utf8_pkg) {
24350                 goto unknown_user_defined;
24351             }
24352
24353             /* Drop down to look up in the official properties */
24354         }
24355         else {
24356             const char insecure[] = "Insecure user-defined property";
24357
24358             /* Here, there is a sub by the correct name.  Normally we call it
24359              * to get the property definition */
24360             dSP;
24361             SV * user_sub_sv = MUTABLE_SV(user_sub);
24362             SV * error;     /* Any error returned by calling 'user_sub' */
24363             SV * key;       /* The key into the hash of user defined sub names
24364                              */
24365             SV * placeholder;
24366             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
24367
24368             /* How many times to retry when another thread is in the middle of
24369              * expanding the same definition we want */
24370             PERL_INT_FAST8_T retry_countdown = 10;
24371
24372             DECLARATION_FOR_GLOBAL_CONTEXT;
24373
24374             /* If we get here, we know this property is user-defined */
24375             *user_defined_ptr = TRUE;
24376
24377             /* We refuse to call a potentially tainted subroutine; returning an
24378              * error instead */
24379             if (TAINT_get) {
24380                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24381                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24382                 goto append_name_to_msg;
24383             }
24384
24385             /* In principal, we only call each subroutine property definition
24386              * once during the life of the program.  This guarantees that the
24387              * property definition never changes.  The results of the single
24388              * sub call are stored in a hash, which is used instead for future
24389              * references to this property.  The property definition is thus
24390              * immutable.  But, to allow the user to have a /i-dependent
24391              * definition, we call the sub once for non-/i, and once for /i,
24392              * should the need arise, passing the /i status as a parameter.
24393              *
24394              * We start by constructing the hash key name, consisting of the
24395              * fully qualified subroutine name, preceded by the /i status, so
24396              * that there is a key for /i and a different key for non-/i */
24397             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24398             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24399                                           non_pkg_begin != 0);
24400             sv_catsv(key, fq_name);
24401             sv_2mortal(key);
24402
24403             /* We only call the sub once throughout the life of the program
24404              * (with the /i, non-/i exception noted above).  That means the
24405              * hash must be global and accessible to all threads.  It is
24406              * created at program start-up, before any threads are created, so
24407              * is accessible to all children.  But this creates some
24408              * complications.
24409              *
24410              * 1) The keys can't be shared, or else problems arise; sharing is
24411              *    turned off at hash creation time
24412              * 2) All SVs in it are there for the remainder of the life of the
24413              *    program, and must be created in the same interpreter context
24414              *    as the hash, or else they will be freed from the wrong pool
24415              *    at global destruction time.  This is handled by switching to
24416              *    the hash's context to create each SV going into it, and then
24417              *    immediately switching back
24418              * 3) All accesses to the hash must be controlled by a mutex, to
24419              *    prevent two threads from getting an unstable state should
24420              *    they simultaneously be accessing it.  The code below is
24421              *    crafted so that the mutex is locked whenever there is an
24422              *    access and unlocked only when the next stable state is
24423              *    achieved.
24424              *
24425              * The hash stores either the definition of the property if it was
24426              * valid, or, if invalid, the error message that was raised.  We
24427              * use the type of SV to distinguish.
24428              *
24429              * There's also the need to guard against the definition expansion
24430              * from infinitely recursing.  This is handled by storing the aTHX
24431              * of the expanding thread during the expansion.  Again the SV type
24432              * is used to distinguish this from the other two cases.  If we
24433              * come to here and the hash entry for this property is our aTHX,
24434              * it means we have recursed, and the code assumes that we would
24435              * infinitely recurse, so instead stops and raises an error.
24436              * (Any recursion has always been treated as infinite recursion in
24437              * this feature.)
24438              *
24439              * If instead, the entry is for a different aTHX, it means that
24440              * that thread has gotten here first, and hasn't finished expanding
24441              * the definition yet.  We just have to wait until it is done.  We
24442              * sleep and retry a few times, returning an error if the other
24443              * thread doesn't complete. */
24444
24445           re_fetch:
24446             USER_PROP_MUTEX_LOCK;
24447
24448             /* If we have an entry for this key, the subroutine has already
24449              * been called once with this /i status. */
24450             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24451                                                    SvPVX(key), SvCUR(key), 0);
24452             if (saved_user_prop_ptr) {
24453
24454                 /* If the saved result is an inversion list, it is the valid
24455                  * definition of this property */
24456                 if (is_invlist(*saved_user_prop_ptr)) {
24457                     prop_definition = *saved_user_prop_ptr;
24458
24459                     /* The SV in the hash won't be removed until global
24460                      * destruction, so it is stable and we can unlock */
24461                     USER_PROP_MUTEX_UNLOCK;
24462
24463                     /* The caller shouldn't try to free this SV */
24464                     return prop_definition;
24465                 }
24466
24467                 /* Otherwise, if it is a string, it is the error message
24468                  * that was returned when we first tried to evaluate this
24469                  * property.  Fail, and append the message */
24470                 if (SvPOK(*saved_user_prop_ptr)) {
24471                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24472                     sv_catsv(msg, *saved_user_prop_ptr);
24473
24474                     /* The SV in the hash won't be removed until global
24475                      * destruction, so it is stable and we can unlock */
24476                     USER_PROP_MUTEX_UNLOCK;
24477
24478                     return NULL;
24479                 }
24480
24481                 assert(SvIOK(*saved_user_prop_ptr));
24482
24483                 /* Here, we have an unstable entry in the hash.  Either another
24484                  * thread is in the middle of expanding the property's
24485                  * definition, or we are ourselves recursing.  We use the aTHX
24486                  * in it to distinguish */
24487                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24488
24489                     /* Here, it's another thread doing the expanding.  We've
24490                      * looked as much as we are going to at the contents of the
24491                      * hash entry.  It's safe to unlock. */
24492                     USER_PROP_MUTEX_UNLOCK;
24493
24494                     /* Retry a few times */
24495                     if (retry_countdown-- > 0) {
24496                         PerlProc_sleep(1);
24497                         goto re_fetch;
24498                     }
24499
24500                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24501                     sv_catpvs(msg, "Timeout waiting for another thread to "
24502                                    "define");
24503                     goto append_name_to_msg;
24504                 }
24505
24506                 /* Here, we are recursing; don't dig any deeper */
24507                 USER_PROP_MUTEX_UNLOCK;
24508
24509                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24510                 sv_catpvs(msg,
24511                           "Infinite recursion in user-defined property");
24512                 goto append_name_to_msg;
24513             }
24514
24515             /* Here, this thread has exclusive control, and there is no entry
24516              * for this property in the hash.  So we have the go ahead to
24517              * expand the definition ourselves. */
24518
24519             PUSHSTACKi(PERLSI_REGCOMP);
24520             ENTER;
24521
24522             /* Create a temporary placeholder in the hash to detect recursion
24523              * */
24524             SWITCH_TO_GLOBAL_CONTEXT;
24525             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24526             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24527             RESTORE_CONTEXT;
24528
24529             /* Now that we have a placeholder, we can let other threads
24530              * continue */
24531             USER_PROP_MUTEX_UNLOCK;
24532
24533             /* Make sure the placeholder always gets destroyed */
24534             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24535
24536             PUSHMARK(SP);
24537             SAVETMPS;
24538
24539             /* Call the user's function, with the /i status as a parameter.
24540              * Note that we have gone to a lot of trouble to keep this call
24541              * from being within the locked mutex region. */
24542             XPUSHs(boolSV(to_fold));
24543             PUTBACK;
24544
24545             /* The following block was taken from swash_init().  Presumably
24546              * they apply to here as well, though we no longer use a swash --
24547              * khw */
24548             SAVEHINTS();
24549             save_re_context();
24550             /* We might get here via a subroutine signature which uses a utf8
24551              * parameter name, at which point PL_subname will have been set
24552              * but not yet used. */
24553             save_item(PL_subname);
24554
24555             /* G_SCALAR guarantees a single return value */
24556             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24557
24558             SPAGAIN;
24559
24560             error = ERRSV;
24561             if (TAINT_get || SvTRUE(error)) {
24562                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24563                 if (SvTRUE(error)) {
24564                     sv_catpvs(msg, "Error \"");
24565                     sv_catsv(msg, error);
24566                     sv_catpvs(msg, "\"");
24567                 }
24568                 if (TAINT_get) {
24569                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
24570                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24571                 }
24572
24573                 if (name_len > 0) {
24574                     sv_catpvs(msg, " in expansion of ");
24575                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24576                                                                   name_len,
24577                                                                   name));
24578                 }
24579
24580                 (void) POPs;
24581                 prop_definition = NULL;
24582             }
24583             else {
24584                 SV * contents = POPs;
24585
24586                 /* The contents is supposed to be the expansion of the property
24587                  * definition.  If the definition is deferrable, and we got an
24588                  * empty string back, set a flag to later defer it (after clean
24589                  * up below). */
24590                 if (      deferrable
24591                     && (! SvPOK(contents) || SvCUR(contents) == 0))
24592                 {
24593                         empty_return = TRUE;
24594                 }
24595                 else { /* Otherwise, call a function to check for valid syntax,
24596                           and handle it */
24597
24598                     prop_definition = handle_user_defined_property(
24599                                                     name, name_len,
24600                                                     is_utf8, to_fold, runtime,
24601                                                     deferrable,
24602                                                     contents, user_defined_ptr,
24603                                                     msg,
24604                                                     level);
24605                 }
24606             }
24607
24608             /* Here, we have the results of the expansion.  Delete the
24609              * placeholder, and if the definition is now known, replace it with
24610              * that definition.  We need exclusive access to the hash, and we
24611              * can't let anyone else in, between when we delete the placeholder
24612              * and add the permanent entry */
24613             USER_PROP_MUTEX_LOCK;
24614
24615             S_delete_recursion_entry(aTHX_ SvPVX(key));
24616
24617             if (    ! empty_return
24618                 && (! prop_definition || is_invlist(prop_definition)))
24619             {
24620                 /* If we got success we use the inversion list defining the
24621                  * property; otherwise use the error message */
24622                 SWITCH_TO_GLOBAL_CONTEXT;
24623                 (void) hv_store_ent(PL_user_def_props,
24624                                     key,
24625                                     ((prop_definition)
24626                                      ? newSVsv(prop_definition)
24627                                      : newSVsv(msg)),
24628                                     0);
24629                 RESTORE_CONTEXT;
24630             }
24631
24632             /* All done, and the hash now has a permanent entry for this
24633              * property.  Give up exclusive control */
24634             USER_PROP_MUTEX_UNLOCK;
24635
24636             FREETMPS;
24637             LEAVE;
24638             POPSTACK;
24639
24640             if (empty_return) {
24641                 goto definition_deferred;
24642             }
24643
24644             if (prop_definition) {
24645
24646                 /* If the definition is for something not known at this time,
24647                  * we toss it, and go return the main property name, as that's
24648                  * the one the user will be aware of */
24649                 if (! is_invlist(prop_definition)) {
24650                     SvREFCNT_dec_NN(prop_definition);
24651                     goto definition_deferred;
24652                 }
24653
24654                 sv_2mortal(prop_definition);
24655             }
24656
24657             /* And return */
24658             return prop_definition;
24659
24660         }   /* End of calling the subroutine for the user-defined property */
24661     }       /* End of it could be a user-defined property */
24662
24663     /* Here it wasn't a user-defined property that is known at this time.  See
24664      * if it is a Unicode property */
24665
24666     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24667
24668     /* Get the index into our pointer table of the inversion list corresponding
24669      * to the property */
24670     table_index = do_uniprop_match(lookup_name, lookup_len);
24671
24672     /* If it didn't find the property ... */
24673     if (table_index == 0) {
24674
24675         /* Try again stripping off any initial 'Is'.  This is because we
24676          * promise that an initial Is is optional.  The same isn't true of
24677          * names that start with 'In'.  Those can match only blocks, and the
24678          * lookup table already has those accounted for.  The lookup table also
24679          * has already accounted for Perl extensions (without and = sign)
24680          * starting with 'i's'. */
24681         if (starts_with_Is && equals_pos >= 0) {
24682             lookup_name += 2;
24683             lookup_len -= 2;
24684             equals_pos -= 2;
24685             slash_pos -= 2;
24686
24687             table_index = do_uniprop_match(lookup_name, lookup_len);
24688         }
24689
24690         if (table_index == 0) {
24691             char * canonical;
24692
24693             /* Here, we didn't find it.  If not a numeric type property, and
24694              * can't be a user-defined one, it isn't a legal property */
24695             if (! is_nv_type) {
24696                 if (! could_be_user_defined) {
24697                     goto failed;
24698                 }
24699
24700                 /* Here, the property name is legal as a user-defined one.   At
24701                  * compile time, it might just be that the subroutine for that
24702                  * property hasn't been encountered yet, but at runtime, it's
24703                  * an error to try to use an undefined one */
24704                 if (! deferrable) {
24705                     goto unknown_user_defined;;
24706                 }
24707
24708                 goto definition_deferred;
24709             } /* End of isn't a numeric type property */
24710
24711             /* The numeric type properties need more work to decide.  What we
24712              * do is make sure we have the number in canonical form and look
24713              * that up. */
24714
24715             if (slash_pos < 0) {    /* No slash */
24716
24717                 /* When it isn't a rational, take the input, convert it to a
24718                  * NV, then create a canonical string representation of that
24719                  * NV. */
24720
24721                 NV value;
24722                 SSize_t value_len = lookup_len - equals_pos;
24723
24724                 /* Get the value */
24725                 if (   value_len <= 0
24726                     || my_atof3(lookup_name + equals_pos, &value,
24727                                 value_len)
24728                           != lookup_name + lookup_len)
24729                 {
24730                     goto failed;
24731                 }
24732
24733                 /* If the value is an integer, the canonical value is integral
24734                  * */
24735                 if (Perl_ceil(value) == value) {
24736                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24737                                             equals_pos, lookup_name, value);
24738                 }
24739                 else {  /* Otherwise, it is %e with a known precision */
24740                     char * exp_ptr;
24741
24742                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24743                                                 equals_pos, lookup_name,
24744                                                 PL_E_FORMAT_PRECISION, value);
24745
24746                     /* The exponent generated is expecting two digits, whereas
24747                      * %e on some systems will generate three.  Remove leading
24748                      * zeros in excess of 2 from the exponent.  We start
24749                      * looking for them after the '=' */
24750                     exp_ptr = strchr(canonical + equals_pos, 'e');
24751                     if (exp_ptr) {
24752                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24753                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24754
24755                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24756
24757                         if (excess_exponent_len > 0) {
24758                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24759                             SSize_t excess_leading_zeros
24760                                     = MIN(leading_zeros, excess_exponent_len);
24761                             if (excess_leading_zeros > 0) {
24762                                 Move(cur_ptr + excess_leading_zeros,
24763                                      cur_ptr,
24764                                      strlen(cur_ptr) - excess_leading_zeros
24765                                        + 1,  /* Copy the NUL as well */
24766                                      char);
24767                             }
24768                         }
24769                     }
24770                 }
24771             }
24772             else {  /* Has a slash.  Create a rational in canonical form  */
24773                 UV numerator, denominator, gcd, trial;
24774                 const char * end_ptr;
24775                 const char * sign = "";
24776
24777                 /* We can't just find the numerator, denominator, and do the
24778                  * division, then use the method above, because that is
24779                  * inexact.  And the input could be a rational that is within
24780                  * epsilon (given our precision) of a valid rational, and would
24781                  * then incorrectly compare valid.
24782                  *
24783                  * We're only interested in the part after the '=' */
24784                 const char * this_lookup_name = lookup_name + equals_pos;
24785                 lookup_len -= equals_pos;
24786                 slash_pos -= equals_pos;
24787
24788                 /* Handle any leading minus */
24789                 if (this_lookup_name[0] == '-') {
24790                     sign = "-";
24791                     this_lookup_name++;
24792                     lookup_len--;
24793                     slash_pos--;
24794                 }
24795
24796                 /* Convert the numerator to numeric */
24797                 end_ptr = this_lookup_name + slash_pos;
24798                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24799                     goto failed;
24800                 }
24801
24802                 /* It better have included all characters before the slash */
24803                 if (*end_ptr != '/') {
24804                     goto failed;
24805                 }
24806
24807                 /* Set to look at just the denominator */
24808                 this_lookup_name += slash_pos;
24809                 lookup_len -= slash_pos;
24810                 end_ptr = this_lookup_name + lookup_len;
24811
24812                 /* Convert the denominator to numeric */
24813                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24814                     goto failed;
24815                 }
24816
24817                 /* It better be the rest of the characters, and don't divide by
24818                  * 0 */
24819                 if (   end_ptr != this_lookup_name + lookup_len
24820                     || denominator == 0)
24821                 {
24822                     goto failed;
24823                 }
24824
24825                 /* Get the greatest common denominator using
24826                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24827                 gcd = numerator;
24828                 trial = denominator;
24829                 while (trial != 0) {
24830                     UV temp = trial;
24831                     trial = gcd % trial;
24832                     gcd = temp;
24833                 }
24834
24835                 /* If already in lowest possible terms, we have already tried
24836                  * looking this up */
24837                 if (gcd == 1) {
24838                     goto failed;
24839                 }
24840
24841                 /* Reduce the rational, which should put it in canonical form
24842                  * */
24843                 numerator /= gcd;
24844                 denominator /= gcd;
24845
24846                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24847                         equals_pos, lookup_name, sign, numerator, denominator);
24848             }
24849
24850             /* Here, we have the number in canonical form.  Try that */
24851             table_index = do_uniprop_match(canonical, strlen(canonical));
24852             if (table_index == 0) {
24853                 goto failed;
24854             }
24855         }   /* End of still didn't find the property in our table */
24856     }       /* End of       didn't find the property in our table */
24857
24858     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24859      * A negative return signifies that the real index is the absolute value,
24860      * but the result needs to be inverted */
24861     if (table_index < 0) {
24862         invert_return = TRUE;
24863         table_index = -table_index;
24864     }
24865
24866     /* Out-of band indices indicate a deprecated property.  The proper index is
24867      * modulo it with the table size.  And dividing by the table size yields
24868      * an offset into a table constructed by regen/mk_invlists.pl to contain
24869      * the corresponding warning message */
24870     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24871         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24872         table_index %= MAX_UNI_KEYWORD_INDEX;
24873         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24874                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24875                 (int) name_len, name,
24876                 get_deprecated_property_msg(warning_offset));
24877     }
24878
24879     /* In a few properties, a different property is used under /i.  These are
24880      * unlikely to change, so are hard-coded here. */
24881     if (to_fold) {
24882         if (   table_index == UNI_XPOSIXUPPER
24883             || table_index == UNI_XPOSIXLOWER
24884             || table_index == UNI_TITLE)
24885         {
24886             table_index = UNI_CASED;
24887         }
24888         else if (   table_index == UNI_UPPERCASELETTER
24889                  || table_index == UNI_LOWERCASELETTER
24890 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24891                  || table_index == UNI_TITLECASELETTER
24892 #  endif
24893         ) {
24894             table_index = UNI_CASEDLETTER;
24895         }
24896         else if (  table_index == UNI_POSIXUPPER
24897                 || table_index == UNI_POSIXLOWER)
24898         {
24899             table_index = UNI_POSIXALPHA;
24900         }
24901     }
24902
24903     /* Create and return the inversion list */
24904     prop_definition = get_prop_definition(table_index);
24905     sv_2mortal(prop_definition);
24906
24907     /* See if there is a private use override to add to this definition */
24908     {
24909         COPHH * hinthash = (IN_PERL_COMPILETIME)
24910                            ? CopHINTHASH_get(&PL_compiling)
24911                            : CopHINTHASH_get(PL_curcop);
24912         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24913
24914         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24915
24916             /* See if there is an element in the hints hash for this table */
24917             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24918             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24919
24920             if (pos) {
24921                 bool dummy;
24922                 SV * pu_definition;
24923                 SV * pu_invlist;
24924                 SV * expanded_prop_definition =
24925                             sv_2mortal(invlist_clone(prop_definition, NULL));
24926
24927                 /* If so, it's definition is the string from here to the next
24928                  * \a character.  And its format is the same as a user-defined
24929                  * property */
24930                 pos += SvCUR(pu_lookup);
24931                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24932                 pu_invlist = handle_user_defined_property(lookup_name,
24933                                                           lookup_len,
24934                                                           0, /* Not UTF-8 */
24935                                                           0, /* Not folded */
24936                                                           runtime,
24937                                                           deferrable,
24938                                                           pu_definition,
24939                                                           &dummy,
24940                                                           msg,
24941                                                           level);
24942                 if (TAINT_get) {
24943                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24944                     sv_catpvs(msg, "Insecure private-use override");
24945                     goto append_name_to_msg;
24946                 }
24947
24948                 /* For now, as a safety measure, make sure that it doesn't
24949                  * override non-private use code points */
24950                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24951
24952                 /* Add it to the list to be returned */
24953                 _invlist_union(prop_definition, pu_invlist,
24954                                &expanded_prop_definition);
24955                 prop_definition = expanded_prop_definition;
24956                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24957             }
24958         }
24959     }
24960
24961     if (invert_return) {
24962         _invlist_invert(prop_definition);
24963     }
24964     return prop_definition;
24965
24966   unknown_user_defined:
24967     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24968     sv_catpvs(msg, "Unknown user-defined property name");
24969     goto append_name_to_msg;
24970
24971   failed:
24972     if (non_pkg_begin != 0) {
24973         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24974         sv_catpvs(msg, "Illegal user-defined property name");
24975     }
24976     else {
24977         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24978         sv_catpvs(msg, "Can't find Unicode property definition");
24979     }
24980     /* FALLTHROUGH */
24981
24982   append_name_to_msg:
24983     {
24984         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
24985         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
24986
24987         sv_catpv(msg, prefix);
24988         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24989         sv_catpv(msg, suffix);
24990     }
24991
24992     return NULL;
24993
24994   definition_deferred:
24995
24996     {
24997         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
24998
24999         /* Here it could yet to be defined, so defer evaluation of this until
25000          * its needed at runtime.  We need the fully qualified property name to
25001          * avoid ambiguity */
25002         if (! fq_name) {
25003             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25004                                                                 is_qualified);
25005         }
25006
25007         /* If it didn't come with a package, or the package is utf8::, this
25008          * actually could be an official Unicode property whose inclusion we
25009          * are deferring until runtime to make sure that it isn't overridden by
25010          * a user-defined property of the same name (which we haven't
25011          * encountered yet).  Add a marker to indicate this possibility, for
25012          * use at such time when we first need the definition during pattern
25013          * matching execution */
25014         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25015             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25016         }
25017
25018         /* We also need a trailing newline */
25019         sv_catpvs(fq_name, "\n");
25020
25021         *user_defined_ptr = TRUE;
25022         return fq_name;
25023     }
25024 }
25025
25026 STATIC bool
25027 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25028                               const STRLEN wname_len, /* Its length */
25029                               SV ** prop_definition,
25030                               AV ** strings)
25031 {
25032     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25033      * any matches, adding them to prop_definition */
25034
25035     dSP;
25036
25037     CV * get_names_info;        /* entry to charnames.pm to get info we need */
25038     SV * names_string;          /* Contains all character names, except algo */
25039     SV * algorithmic_names;     /* Contains info about algorithmically
25040                                    generated character names */
25041     REGEXP * subpattern_re;     /* The user's pattern to match with */
25042     struct regexp * prog;       /* The compiled pattern */
25043     char * all_names_start;     /* lib/unicore/Name.pl string of every
25044                                    (non-algorithmic) character name */
25045     char * cur_pos;             /* We match, effectively using /gc; this is
25046                                    where we are now */
25047     bool found_matches = FALSE; /* Did any name match so far? */
25048     SV * empty;                 /* For matching zero length names */
25049     SV * must_sv;               /* Contains the substring, if any, that must be
25050                                    in a name for the subpattern to match */
25051     const char * must;          /* The PV of 'must' */
25052     STRLEN must_len;            /* And its length */
25053     SV * syllable_name = NULL;  /* For Hangul syllables */
25054     const char hangul_prefix[] = "HANGUL SYLLABLE ";
25055     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25056
25057     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25058      * syllable name, and these are immutable and guaranteed by the Unicode
25059      * standard to never be extended */
25060     const STRLEN syl_max_len = hangul_prefix_len + 7;
25061
25062     IV i;
25063
25064     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25065
25066     /* Make sure _charnames is loaded.  (The parameters give context
25067      * for any errors generated */
25068     get_names_info = get_cv("_charnames::_get_names_info", 0);
25069     if (! get_names_info) {
25070         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25071     }
25072
25073     /* Get the charnames data */
25074     PUSHSTACKi(PERLSI_REGCOMP);
25075     ENTER ;
25076     SAVETMPS;
25077     save_re_context();
25078
25079     PUSHMARK(SP) ;
25080     PUTBACK;
25081
25082     /* Special _charnames entry point that returns the info this routine
25083      * requires */
25084     call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25085
25086     SPAGAIN ;
25087
25088     /* Data structure for names which end in their very own code points */
25089     algorithmic_names = POPs;
25090     SvREFCNT_inc_simple_void_NN(algorithmic_names);
25091
25092     /* The lib/unicore/Name.pl string */
25093     names_string = POPs;
25094     SvREFCNT_inc_simple_void_NN(names_string);
25095
25096     PUTBACK ;
25097     FREETMPS ;
25098     LEAVE ;
25099     POPSTACK;
25100
25101     if (   ! SvROK(names_string)
25102         || ! SvROK(algorithmic_names))
25103     {   /* Perhaps should panic instead XXX */
25104         SvREFCNT_dec(names_string);
25105         SvREFCNT_dec(algorithmic_names);
25106         return FALSE;
25107     }
25108
25109     names_string = sv_2mortal(SvRV(names_string));
25110     all_names_start = SvPVX(names_string);
25111     cur_pos = all_names_start;
25112
25113     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25114
25115     /* Compile the subpattern consisting of the name being looked for */
25116     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25117
25118     must_sv = re_intuit_string(subpattern_re);
25119     if (must_sv) {
25120         /* regexec.c can free the re_intuit_string() return. GH #17734 */
25121         must_sv = sv_2mortal(newSVsv(must_sv));
25122         must = SvPV(must_sv, must_len);
25123     }
25124     else {
25125         must = "";
25126         must_len = 0;
25127     }
25128
25129     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
25130      * This works because the NUL causes the function to return early, thus
25131      * showing that there are characters in it other than the acceptable ones,
25132      * which is our desired result.) */
25133
25134     prog = ReANY(subpattern_re);
25135
25136     /* If only nothing is matched, skip to where empty names are looked for */
25137     if (prog->maxlen == 0) {
25138         goto check_empty;
25139     }
25140
25141     /* And match against the string of all names /gc.  Don't even try if it
25142      * must match a character not found in any name. */
25143     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25144     {
25145         while (execute_wildcard(subpattern_re,
25146                                 cur_pos,
25147                                 SvEND(names_string),
25148                                 all_names_start, 0,
25149                                 names_string,
25150                                 0))
25151         { /* Here, matched. */
25152
25153             /* Note the string entries look like
25154              *      00001\nSTART OF HEADING\n\n
25155              * so we could match anywhere in that string.  We have to rule out
25156              * matching a code point line */
25157             char * this_name_start = all_names_start
25158                                                 + RX_OFFS(subpattern_re)->start;
25159             char * this_name_end   = all_names_start
25160                                                 + RX_OFFS(subpattern_re)->end;
25161             char * cp_start;
25162             char * cp_end;
25163             UV cp = 0;      /* Silences some compilers */
25164             AV * this_string = NULL;
25165             bool is_multi = FALSE;
25166
25167             /* If matched nothing, advance to next possible match */
25168             if (this_name_start == this_name_end) {
25169                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25170                                           SvEND(names_string) - this_name_end);
25171                 if (cur_pos == NULL) {
25172                     break;
25173                 }
25174             }
25175             else {
25176                 /* Position the next match to start beyond the current returned
25177                  * entry */
25178                 cur_pos = (char *) memchr(this_name_end, '\n',
25179                                           SvEND(names_string) - this_name_end);
25180             }
25181
25182             /* Back up to the \n just before the beginning of the character. */
25183             cp_end = (char *) my_memrchr(all_names_start,
25184                                          '\n',
25185                                          this_name_start - all_names_start);
25186
25187             /* If we didn't find a \n, it means it matched somewhere in the
25188              * initial '00000' in the string, so isn't a real match */
25189             if (cp_end == NULL) {
25190                 continue;
25191             }
25192
25193             this_name_start = cp_end + 1;   /* The name starts just after */
25194             cp_end--;                       /* the \n, and the code point */
25195                                             /* ends just before it */
25196
25197             /* All code points are 5 digits long */
25198             cp_start = cp_end - 4;
25199
25200             /* This shouldn't happen, as we found a \n, and the first \n is
25201              * further along than what we subtracted */
25202             assert(cp_start >= all_names_start);
25203
25204             if (cp_start == all_names_start) {
25205                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25206                 continue;
25207             }
25208
25209             /* If the character is a blank, we either have a named sequence, or
25210              * something is wrong */
25211             if (*(cp_start - 1) == ' ') {
25212                 cp_start = (char *) my_memrchr(all_names_start,
25213                                                '\n',
25214                                                cp_start - all_names_start);
25215                 cp_start++;
25216             }
25217
25218             assert(cp_start != NULL && cp_start >= all_names_start + 2);
25219
25220             /* Except for the first line in the string, the sequence before the
25221              * code point is \n\n.  If that isn't the case here, we didn't
25222              * match the name of a character.  (We could have matched a named
25223              * sequence, not currently handled */
25224             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25225                 continue;
25226             }
25227
25228             /* We matched!  Add this to the list */
25229             found_matches = TRUE;
25230
25231             /* Loop through all the code points in the sequence */
25232             while (cp_start < cp_end) {
25233
25234                 /* Calculate this code point from its 5 digits */
25235                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25236                    + (XDIGIT_VALUE(cp_start[1]) << 12)
25237                    + (XDIGIT_VALUE(cp_start[2]) << 8)
25238                    + (XDIGIT_VALUE(cp_start[3]) << 4)
25239                    +  XDIGIT_VALUE(cp_start[4]);
25240
25241                 cp_start += 6;  /* Go past any blank */
25242
25243                 if (cp_start < cp_end || is_multi) {
25244                     if (this_string == NULL) {
25245                         this_string = newAV();
25246                     }
25247
25248                     is_multi = TRUE;
25249                     av_push(this_string, newSVuv(cp));
25250                 }
25251             }
25252
25253             if (is_multi) { /* Was more than one code point */
25254                 if (*strings == NULL) {
25255                     *strings = newAV();
25256                 }
25257
25258                 av_push(*strings, (SV *) this_string);
25259             }
25260             else {  /* Only a single code point */
25261                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25262             }
25263         } /* End of loop through the non-algorithmic names string */
25264     }
25265
25266     /* There are also character names not in 'names_string'.  These are
25267      * algorithmically generatable.  Try this pattern on each possible one.
25268      * (khw originally planned to leave this out given the large number of
25269      * matches attempted; but the speed turned out to be quite acceptable
25270      *
25271      * There are plenty of opportunities to optimize to skip many of the tests.
25272      * beyond the rudimentary ones already here */
25273
25274     /* First see if the subpattern matches any of the algorithmic generatable
25275      * Hangul syllable names.
25276      *
25277      * We know none of these syllable names will match if the input pattern
25278      * requires more bytes than any syllable has, or if the input pattern only
25279      * matches an empty name, or if the pattern has something it must match and
25280      * one of the characters in that isn't in any Hangul syllable. */
25281     if (    prog->minlen <= (SSize_t) syl_max_len
25282         &&  prog->maxlen > 0
25283         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25284     {
25285         /* These constants, names, values, and algorithm are adapted from the
25286          * Unicode standard, version 5.1, section 3.12, and should never
25287          * change. */
25288         const char * JamoL[] = {
25289             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25290             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25291         };
25292         const int LCount = C_ARRAY_LENGTH(JamoL);
25293
25294         const char * JamoV[] = {
25295             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25296             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25297             "I"
25298         };
25299         const int VCount = C_ARRAY_LENGTH(JamoV);
25300
25301         const char * JamoT[] = {
25302             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25303             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25304             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25305         };
25306         const int TCount = C_ARRAY_LENGTH(JamoT);
25307
25308         int L, V, T;
25309
25310         /* This is the initial Hangul syllable code point; each time through the
25311          * inner loop, it maps to the next higher code point.  For more info,
25312          * see the Hangul syllable section of the Unicode standard. */
25313         int cp = 0xAC00;
25314
25315         syllable_name = sv_2mortal(newSV(syl_max_len));
25316         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25317
25318         for (L = 0; L < LCount; L++) {
25319             for (V = 0; V < VCount; V++) {
25320                 for (T = 0; T < TCount; T++) {
25321
25322                     /* Truncate back to the prefix, which is unvarying */
25323                     SvCUR_set(syllable_name, hangul_prefix_len);
25324
25325                     sv_catpv(syllable_name, JamoL[L]);
25326                     sv_catpv(syllable_name, JamoV[V]);
25327                     sv_catpv(syllable_name, JamoT[T]);
25328
25329                     if (execute_wildcard(subpattern_re,
25330                                 SvPVX(syllable_name),
25331                                 SvEND(syllable_name),
25332                                 SvPVX(syllable_name), 0,
25333                                 syllable_name,
25334                                 0))
25335                     {
25336                         *prop_definition = add_cp_to_invlist(*prop_definition,
25337                                                              cp);
25338                         found_matches = TRUE;
25339                     }
25340
25341                     cp++;
25342                 }
25343             }
25344         }
25345     }
25346
25347     /* The rest of the algorithmically generatable names are of the form
25348      * "PREFIX-code_point".  The prefixes and the code point limits of each
25349      * were returned to us in the array 'algorithmic_names' from data in
25350      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
25351     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25352         IV j;
25353
25354         /* Each element of the array is a hash, giving the details for the
25355          * series of names it covers.  There is the base name of the characters
25356          * in the series, and the low and high code points in the series.  And,
25357          * for optimization purposes a string containing all the legal
25358          * characters that could possibly be in a name in this series. */
25359         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25360         SV * prefix = * hv_fetchs(this_series, "name", 0);
25361         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25362         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25363         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25364
25365         /* Pre-allocate an SV with enough space */
25366         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25367                                                         SvPVX(prefix)));
25368         if (high >= 0x10000) {
25369             sv_catpvs(algo_name, "0");
25370         }
25371
25372         /* This series can be skipped entirely if the pattern requires
25373          * something longer than any name in the series, or can only match an
25374          * empty name, or contains a character not found in any name in the
25375          * series */
25376         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
25377             &&  prog->maxlen > 0
25378             && (strspn(must, legal) == must_len))
25379         {
25380             for (j = low; j <= high; j++) { /* For each code point in the series */
25381
25382                 /* Get its name, and see if it matches the subpattern */
25383                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25384                                      (unsigned) j);
25385
25386                 if (execute_wildcard(subpattern_re,
25387                                     SvPVX(algo_name),
25388                                     SvEND(algo_name),
25389                                     SvPVX(algo_name), 0,
25390                                     algo_name,
25391                                     0))
25392                 {
25393                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
25394                     found_matches = TRUE;
25395                 }
25396             }
25397         }
25398     }
25399
25400   check_empty:
25401     /* Finally, see if the subpattern matches an empty string */
25402     empty = newSVpvs("");
25403     if (execute_wildcard(subpattern_re,
25404                          SvPVX(empty),
25405                          SvEND(empty),
25406                          SvPVX(empty), 0,
25407                          empty,
25408                          0))
25409     {
25410         /* Many code points have empty names.  Currently these are the \p{GC=C}
25411          * ones, minus CC and CF */
25412
25413         SV * empty_names_ref = get_prop_definition(UNI_C);
25414         SV * empty_names = invlist_clone(empty_names_ref, NULL);
25415
25416         SV * subtract = get_prop_definition(UNI_CC);
25417
25418         _invlist_subtract(empty_names, subtract, &empty_names);
25419         SvREFCNT_dec_NN(empty_names_ref);
25420         SvREFCNT_dec_NN(subtract);
25421
25422         subtract = get_prop_definition(UNI_CF);
25423         _invlist_subtract(empty_names, subtract, &empty_names);
25424         SvREFCNT_dec_NN(subtract);
25425
25426         _invlist_union(*prop_definition, empty_names, prop_definition);
25427         found_matches = TRUE;
25428         SvREFCNT_dec_NN(empty_names);
25429     }
25430     SvREFCNT_dec_NN(empty);
25431
25432 #if 0
25433     /* If we ever were to accept aliases for, say private use names, we would
25434      * need to do something fancier to find empty names.  The code below works
25435      * (at the time it was written), and is slower than the above */
25436     const char empties_pat[] = "^.";
25437     if (strNE(name, empties_pat)) {
25438         SV * empty = newSVpvs("");
25439         if (execute_wildcard(subpattern_re,
25440                     SvPVX(empty),
25441                     SvEND(empty),
25442                     SvPVX(empty), 0,
25443                     empty,
25444                     0))
25445         {
25446             SV * empties = NULL;
25447
25448             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25449
25450             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25451             SvREFCNT_dec_NN(empties);
25452
25453             found_matches = TRUE;
25454         }
25455         SvREFCNT_dec_NN(empty);
25456     }
25457 #endif
25458
25459     SvREFCNT_dec_NN(subpattern_re);
25460     return found_matches;
25461 }
25462
25463 /*
25464  * ex: set ts=8 sts=4 sw=4 et:
25465  */