regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73
  74 /* Note on debug output:
  75  *
  76  * This is set up so that -Dr turns on debugging like all other flags that are
  77  * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
  78  * all regular expressions encountered in a program, and gives a huge amount of
  79  * output for all but the shortest programs.
  80  *
  81  * The ability to output pattern debugging information lexically, and with much
  82  * finer grained control was added, with 'use re qw(Debug ....);' available even
  83  * in non-DEBUGGING builds.  This is accomplished by copying the contents of
  84  * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
  85  * Those files are compiled and linked into the perl executable, and they are
  86  * compiled essentially as if DEBUGGING were enabled, and controlled by calls
  87  * to re.pm.
  88  *
  89  * That would normally mean linking errors when two functions of the same name
  90  * are attempted to be placed into the same executable.  That is solved in one
  91  * of four ways:
  92  *  1)  Static functions aren't known outside the file they are in, so for the
  93  *      many functions of that type in this file, it just isn't a problem.
  94  *  2)  Most externally known functions are enclosed in
  95  *          #ifndef PERL_IN_XSUB_RE
  96  *          ...
  97  *          #endif
  98  *      blocks, so there is only one defintion for them in the whole
  99  *      executable, the one in regcomp.c (or regexec.c).  The implication of
 100  *      that is any debugging info that comes from them is controlled only by
 101  *      -Dr.  Further, any static function they call will also be the version
 102  *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
 103  *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
 104  *      have different names, so that what gets loaded in the executable is
 105  *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
 106  *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
 107  *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
 108  *      versions and their callees are under control of re.pm.   The catch is
 109  *      that references to all these go through the regexp_engine structure,
 110  *      which is initialized in regcomp.h to the Perl_foo versions, and
 111  *      substituted out in lexical scopes where 'use re' is in effect to the
 112  *      'my_foo' ones.   That structure is public API, so it would be a hard
 113  *      sell to add any additional members.
 114  *  4)  For functions in regcomp.c and re_comp.c that are called only from,
 115  *      respectively, regexec.c and re_exec.c, they can have two different
 116  *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
 117  *      embed.fnc.
 118  *
 119  * The bottom line is that if you add code to one of the public functions
 120  * listed in ext/re/re_top.h, debugging automagically works.  But if you write
 121  * a new function that needs to do debugging or there is a chain of calls from
 122  * it that need to do debugging, all functions in the chain should use options
 123  * 2) or 4) above.
 124  *
 125  * A function may have to be split so that debugging stuff is static, but it
 126  * calls out to some other function that only gets compiled in regcomp.c to
 127  * access data that we don't want to duplicate.
 128  */
 129
 130 #include "EXTERN.h"
 131 #define PERL_IN_REGCOMP_C
 132 #include "perl.h"
 133
 134 #define REG_COMP_C
 135 #ifdef PERL_IN_XSUB_RE
 136 #  include "re_comp.h"
 137 EXTERN_C const struct regexp_engine my_reg_engine;
 138 EXTERN_C const struct regexp_engine wild_reg_engine;
 139 #else
 140 #  include "regcomp.h"
 141 #endif
 142
 143 #include "invlist_inline.h"
 144 #include "unicode_constants.h"
 145
 146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
 147  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
 149  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 152
 153 #ifndef STATIC
 154 #define STATIC  static
 155 #endif
 156
 157 /* this is a chain of data about sub patterns we are processing that
 158    need to be handled separately/specially in study_chunk. Its so
 159    we can simulate recursion without losing state.  */
 160 struct scan_frame;
 161 typedef struct scan_frame {
 162     regnode *last_regnode;      /* last node to process in this frame */
 163     regnode *next_regnode;      /* next node to process when last is reached */
 164     U32 prev_recursed_depth;
 165     I32 stopparen;              /* what stopparen do we use */
 166     bool in_gosub;              /* this or an outer frame is for GOSUB */
 167
 168     struct scan_frame *this_prev_frame; /* this previous frame */
 169     struct scan_frame *prev_frame;      /* previous frame */
 170     struct scan_frame *next_frame;      /* next frame */
 171 } scan_frame;
 172
 173 /* Certain characters are output as a sequence with the first being a
 174  * backslash. */
 175 #define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
 176
 177
 178 struct RExC_state_t {
 179     U32         flags;                  /* RXf_* are we folding, multilining? */
 180     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 181     char        *precomp;               /* uncompiled string. */
 182     char        *precomp_end;           /* pointer to end of uncompiled string. */
 183     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 184     regexp      *rx;                    /* perl core regexp structure */
 185     regexp_internal     *rxi;           /* internal data for regexp object
 186                                            pprivate field */
 187     char        *start;                 /* Start of input for compile */
 188     char        *end;                   /* End of input for compile */
 189     char        *parse;                 /* Input-scan pointer. */
 190     char        *copy_start;            /* start of copy of input within
 191                                            constructed parse string */
 192     char        *save_copy_start;       /* Provides one level of saving
 193                                            and restoring 'copy_start' */
 194     char        *copy_start_in_input;   /* Position in input string
 195                                            corresponding to copy_start */
 196     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 197     regnode     *emit_start;            /* Start of emitted-code area */
 198     regnode_offset emit;                /* Code-emit pointer */
 199     I32         naughty;                /* How bad is this pattern? */
 200     I32         sawback;                /* Did we see \1, ...? */
 201     SSize_t     size;                   /* Number of regnode equivalents in
 202                                            pattern */
 203     Size_t      sets_depth;              /* Counts recursion depth of already-
 204                                            compiled regex set patterns */
 205     U32         seen;
 206
 207     I32      parens_buf_size;           /* #slots malloced open/close_parens */
 208     regnode_offset *open_parens;        /* offsets to open parens */
 209     regnode_offset *close_parens;       /* offsets to close parens */
 210     HV          *paren_names;           /* Paren names */
 211
 212     /* position beyond 'precomp' of the warning message furthest away from
 213      * 'precomp'.  During the parse, no warnings are raised for any problems
 214      * earlier in the parse than this position.  This works if warnings are
 215      * raised the first time a given spot is parsed, and if only one
 216      * independent warning is raised for any given spot */
 217     Size_t      latest_warn_offset;
 218
 219     I32         npar;                   /* Capture buffer count so far in the
 220                                            parse, (OPEN) plus one. ("par" 0 is
 221                                            the whole pattern)*/
 222     I32         total_par;              /* During initial parse, is either 0,
 223                                            or -1; the latter indicating a
 224                                            reparse is needed.  After that pass,
 225                                            it is what 'npar' became after the
 226                                            pass.  Hence, it being > 0 indicates
 227                                            we are in a reparse situation */
 228     I32         nestroot;               /* root parens we are in - used by
 229                                            accept */
 230     I32         seen_zerolen;
 231     regnode     *end_op;                /* END node in program */
 232     I32         utf8;           /* whether the pattern is utf8 or not */
 233     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 234                                 /* XXX use this for future optimisation of case
 235                                  * where pattern must be upgraded to utf8. */
 236     I32         uni_semantics;  /* If a d charset modifier should use unicode
 237                                    rules, even if the pattern is not in
 238                                    utf8 */
 239
 240     I32         recurse_count;          /* Number of recurse regops we have generated */
 241     regnode     **recurse;              /* Recurse regops */
 242     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 243                                            through */
 244     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 245     I32         in_lookaround;
 246     I32         contains_locale;
 247     I32         override_recoding;
 248     I32         recode_x_to_native;
 249     I32         in_multi_char_class;
 250     int         code_index;             /* next code_blocks[] slot */
 251     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 252                                             within pattern */
 253     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 254     scan_frame *frame_head;
 255     scan_frame *frame_last;
 256     U32         frame_count;
 257     AV         *warn_text;
 258     HV         *unlexed_names;
 259     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 260 #ifdef DEBUGGING
 261     const char  *lastparse;
 262     I32         lastnum;
 263     U32         study_chunk_recursed_count;
 264     AV          *paren_name_list;       /* idx -> name */
 265     SV          *mysv1;
 266     SV          *mysv2;
 267
 268 #define RExC_lastparse  (pRExC_state->lastparse)
 269 #define RExC_lastnum    (pRExC_state->lastnum)
 270 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 271 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 272 #define RExC_mysv       (pRExC_state->mysv1)
 273 #define RExC_mysv1      (pRExC_state->mysv1)
 274 #define RExC_mysv2      (pRExC_state->mysv2)
 275
 276 #endif
 277     bool        seen_d_op;
 278     bool        strict;
 279     bool        study_started;
 280     bool        in_script_run;
 281     bool        use_BRANCHJ;
 282     bool        sWARN_EXPERIMENTAL__VLB;
 283     bool        sWARN_EXPERIMENTAL__REGEX_SETS;
 284 };
 285
 286 #define RExC_flags      (pRExC_state->flags)
 287 #define RExC_pm_flags   (pRExC_state->pm_flags)
 288 #define RExC_precomp    (pRExC_state->precomp)
 289 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 290 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 291 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
 292 #define RExC_precomp_end (pRExC_state->precomp_end)
 293 #define RExC_rx_sv      (pRExC_state->rx_sv)
 294 #define RExC_rx         (pRExC_state->rx)
 295 #define RExC_rxi        (pRExC_state->rxi)
 296 #define RExC_start      (pRExC_state->start)
 297 #define RExC_end        (pRExC_state->end)
 298 #define RExC_parse      (pRExC_state->parse)
 299 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 300 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 301 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 302                                                    under /d from /u ? */
 303
 304 #ifdef RE_TRACK_PATTERN_OFFSETS
 305 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 306                                                          others */
 307 #endif
 308 #define RExC_emit       (pRExC_state->emit)
 309 #define RExC_emit_start (pRExC_state->emit_start)
 310 #define RExC_sawback    (pRExC_state->sawback)
 311 #define RExC_seen       (pRExC_state->seen)
 312 #define RExC_size       (pRExC_state->size)
 313 #define RExC_maxlen        (pRExC_state->maxlen)
 314 #define RExC_npar       (pRExC_state->npar)
 315 #define RExC_total_parens       (pRExC_state->total_par)
 316 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 317 #define RExC_nestroot   (pRExC_state->nestroot)
 318 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 319 #define RExC_utf8       (pRExC_state->utf8)
 320 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 321 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 322 #define RExC_open_parens        (pRExC_state->open_parens)
 323 #define RExC_close_parens       (pRExC_state->close_parens)
 324 #define RExC_end_op     (pRExC_state->end_op)
 325 #define RExC_paren_names        (pRExC_state->paren_names)
 326 #define RExC_recurse    (pRExC_state->recurse)
 327 #define RExC_recurse_count      (pRExC_state->recurse_count)
 328 #define RExC_sets_depth         (pRExC_state->sets_depth)
 329 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 330 #define RExC_study_chunk_recursed_bytes  \
 331                                    (pRExC_state->study_chunk_recursed_bytes)
 332 #define RExC_in_lookaround      (pRExC_state->in_lookaround)
 333 #define RExC_contains_locale    (pRExC_state->contains_locale)
 334 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 335
 336 #ifdef EBCDIC
 337 #  define SET_recode_x_to_native(x)                                         \
 338                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 339 #else
 340 #  define SET_recode_x_to_native(x) NOOP
 341 #endif
 342
 343 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 344 #define RExC_frame_head (pRExC_state->frame_head)
 345 #define RExC_frame_last (pRExC_state->frame_last)
 346 #define RExC_frame_count (pRExC_state->frame_count)
 347 #define RExC_strict (pRExC_state->strict)
 348 #define RExC_study_started      (pRExC_state->study_started)
 349 #define RExC_warn_text (pRExC_state->warn_text)
 350 #define RExC_in_script_run      (pRExC_state->in_script_run)
 351 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 352 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
 353 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
 354 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 355
 356 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 357  * a flag to disable back-off on the fixed/floating substrings - if it's
 358  * a high complexity pattern we assume the benefit of avoiding a full match
 359  * is worth the cost of checking for the substrings even if they rarely help.
 360  */
 361 #define RExC_naughty    (pRExC_state->naughty)
 362 #define TOO_NAUGHTY (10)
 363 #define MARK_NAUGHTY(add) \
 364     if (RExC_naughty < TOO_NAUGHTY) \
 365         RExC_naughty += (add)
 366 #define MARK_NAUGHTY_EXP(exp, add) \
 367     if (RExC_naughty < TOO_NAUGHTY) \
 368         RExC_naughty += RExC_naughty / (exp) + (add)
 369
 370 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 371 #define ISMULT2(s)      (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
 372
 373 /*
 374  * Flags to be passed up and down.
 375  */
 376 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 377                                    non-null ones. */
 378 #define SIMPLE          0x02    /* Exactly one character wide */
 379                                 /* (or LNBREAK as a special case) */
 380 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 381 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 382 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 383 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 384                                    calcuate sizes as UTF-8 */
 385
 386 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 387
 388 /* whether trie related optimizations are enabled */
 389 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 390 #define TRIE_STUDY_OPT
 391 #define FULL_TRIE_STUDY
 392 #define TRIE_STCLASS
 393 #endif
 394
 395
 396
 397 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 398 #define PBITVAL(paren) (1 << ((paren) & 7))
 399 #define PAREN_OFFSET(depth) \
 400     (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
 401 #define PAREN_TEST(depth, paren) \
 402     (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
 403 #define PAREN_SET(depth, paren) \
 404     (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
 405 #define PAREN_UNSET(depth, paren) \
 406     (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 407
 408 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 409                                      if (!UTF) {                           \
 410                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 411                                          return 0;                         \
 412                                      }                                     \
 413                              } STMT_END
 414
 415 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
 416  * pattern is in UTF-8.  This latter condition is in case the outermost rules
 417  * are locale.  See GH #17278 */
 418 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
 419
 420 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 421  * a flag that indicates we need to override /d with /u as a result of
 422  * something in the pattern.  It should only be used in regards to calling
 423  * set_regex_charset() or get_regex_charset() */
 424 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 425     STMT_START {                                                            \
 426             if (DEPENDS_SEMANTICS) {                                        \
 427                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 428                 RExC_uni_semantics = 1;                                     \
 429                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 430                     /* No need to restart the parse if we haven't seen      \
 431                      * anything that differs between /u and /d, and no need \
 432                      * to restart immediately if we're going to reparse     \
 433                      * anyway to count parens */                            \
 434                     *flagp |= RESTART_PARSE;                                \
 435                     return restart_retval;                                  \
 436                 }                                                           \
 437             }                                                               \
 438     } STMT_END
 439
 440 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 441     STMT_START {                                                            \
 442                 RExC_use_BRANCHJ = 1;                                       \
 443                 *flagp |= RESTART_PARSE;                                    \
 444                 return restart_retval;                                      \
 445     } STMT_END
 446
 447 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 448  * less.  After that, it must always be positive, because the whole re is
 449  * considered to be surrounded by virtual parens.  Setting it to negative
 450  * indicates there is some construct that needs to know the actual number of
 451  * parens to be properly handled.  And that means an extra pass will be
 452  * required after we've counted them all */
 453 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 454 #define REQUIRE_PARENS_PASS                                                 \
 455     STMT_START {  /* No-op if have completed a pass */                      \
 456                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 457     } STMT_END
 458 #define IN_PARENS_PASS (RExC_total_parens < 0)
 459
 460
 461 /* This is used to return failure (zero) early from the calling function if
 462  * various flags in 'flags' are set.  Two flags always cause a return:
 463  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 464  * additional flags that should cause a return; 0 if none.  If the return will
 465  * be done, '*flagp' is first set to be all of the flags that caused the
 466  * return. */
 467 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 468     STMT_START {                                                            \
 469             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 470                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 471                 return 0;                                                   \
 472             }                                                               \
 473     } STMT_END
 474
 475 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 476
 477 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 478                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 479 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 480                                     if (MUST_RESTART(*(flagp))) return 0
 481
 482 /* This converts the named class defined in regcomp.h to its equivalent class
 483  * number defined in handy.h. */
 484 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 485 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 486
 487 #define _invlist_union_complement_2nd(a, b, output) \
 488                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 489 #define _invlist_intersection_complement_2nd(a, b, output) \
 490                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 491
 492 /* We add a marker if we are deferring expansion of a property that is both
 493  * 1) potentiallly user-defined; and
 494  * 2) could also be an official Unicode property.
 495  *
 496  * Without this marker, any deferred expansion can only be for a user-defined
 497  * one.  This marker shouldn't conflict with any that could be in a legal name,
 498  * and is appended to its name to indicate this.  There is a string and
 499  * character form */
 500 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
 501 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
 502
 503 /* What is infinity for optimization purposes */
 504 #define OPTIMIZE_INFTY  SSize_t_MAX
 505
 506 /* About scan_data_t.
 507
 508   During optimisation we recurse through the regexp program performing
 509   various inplace (keyhole style) optimisations. In addition study_chunk
 510   and scan_commit populate this data structure with information about
 511   what strings MUST appear in the pattern. We look for the longest
 512   string that must appear at a fixed location, and we look for the
 513   longest string that may appear at a floating location. So for instance
 514   in the pattern:
 515
 516     /FOO[xX]A.*B[xX]BAR/
 517
 518   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 519   strings (because they follow a .* construct). study_chunk will identify
 520   both FOO and BAR as being the longest fixed and floating strings respectively.
 521
 522   The strings can be composites, for instance
 523
 524      /(f)(o)(o)/
 525
 526   will result in a composite fixed substring 'foo'.
 527
 528   For each string some basic information is maintained:
 529
 530   - min_offset
 531     This is the position the string must appear at, or not before.
 532     It also implicitly (when combined with minlenp) tells us how many
 533     characters must match before the string we are searching for.
 534     Likewise when combined with minlenp and the length of the string it
 535     tells us how many characters must appear after the string we have
 536     found.
 537
 538   - max_offset
 539     Only used for floating strings. This is the rightmost point that
 540     the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
 541     string can occur infinitely far to the right.
 542     For fixed strings, it is equal to min_offset.
 543
 544   - minlenp
 545     A pointer to the minimum number of characters of the pattern that the
 546     string was found inside. This is important as in the case of positive
 547     lookahead or positive lookbehind we can have multiple patterns
 548     involved. Consider
 549
 550     /(?=FOO).*F/
 551
 552     The minimum length of the pattern overall is 3, the minimum length
 553     of the lookahead part is 3, but the minimum length of the part that
 554     will actually match is 1. So 'FOO's minimum length is 3, but the
 555     minimum length for the F is 1. This is important as the minimum length
 556     is used to determine offsets in front of and behind the string being
 557     looked for.  Since strings can be composites this is the length of the
 558     pattern at the time it was committed with a scan_commit. Note that
 559     the length is calculated by study_chunk, so that the minimum lengths
 560     are not known until the full pattern has been compiled, thus the
 561     pointer to the value.
 562
 563   - lookbehind
 564
 565     In the case of lookbehind the string being searched for can be
 566     offset past the start point of the final matching string.
 567     If this value was just blithely removed from the min_offset it would
 568     invalidate some of the calculations for how many chars must match
 569     before or after (as they are derived from min_offset and minlen and
 570     the length of the string being searched for).
 571     When the final pattern is compiled and the data is moved from the
 572     scan_data_t structure into the regexp structure the information
 573     about lookbehind is factored in, with the information that would
 574     have been lost precalculated in the end_shift field for the
 575     associated string.
 576
 577   The fields pos_min and pos_delta are used to store the minimum offset
 578   and the delta to the maximum offset at the current point in the pattern.
 579
 580 */
 581
 582 struct scan_data_substrs {
 583     SV      *str;       /* longest substring found in pattern */
 584     SSize_t min_offset; /* earliest point in string it can appear */
 585     SSize_t max_offset; /* latest point in string it can appear */
 586     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 587     SSize_t lookbehind; /* is the pos of the string modified by LB */
 588     I32 flags;          /* per substring SF_* and SCF_* flags */
 589 };
 590
 591 typedef struct scan_data_t {
 592     /*I32 len_min;      unused */
 593     /*I32 len_delta;    unused */
 594     SSize_t pos_min;
 595     SSize_t pos_delta;
 596     SV *last_found;
 597     SSize_t last_end;       /* min value, <0 unless valid. */
 598     SSize_t last_start_min;
 599     SSize_t last_start_max;
 600     U8      cur_is_floating; /* whether the last_* values should be set as
 601                               * the next fixed (0) or floating (1)
 602                               * substring */
 603
 604     /* [0] is longest fixed substring so far, [1] is longest float so far */
 605     struct scan_data_substrs  substrs[2];
 606
 607     I32 flags;             /* common SF_* and SCF_* flags */
 608     I32 whilem_c;
 609     SSize_t *last_closep;
 610     regnode_ssc *start_class;
 611 } scan_data_t;
 612
 613 /*
 614  * Forward declarations for pregcomp()'s friends.
 615  */
 616
 617 static const scan_data_t zero_scan_data = {
 618     0, 0, NULL, 0, 0, 0, 0,
 619     {
 620         { NULL, 0, 0, 0, 0, 0 },
 621         { NULL, 0, 0, 0, 0, 0 },
 622     },
 623     0, 0, NULL, NULL
 624 };
 625
 626 /* study flags */
 627
 628 #define SF_BEFORE_SEOL          0x0001
 629 #define SF_BEFORE_MEOL          0x0002
 630 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 631
 632 #define SF_IS_INF               0x0040
 633 #define SF_HAS_PAR              0x0080
 634 #define SF_IN_PAR               0x0100
 635 #define SF_HAS_EVAL             0x0200
 636
 637
 638 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 639  * longest substring in the pattern. When it is not set the optimiser keeps
 640  * track of position, but does not keep track of the actual strings seen,
 641  *
 642  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 643  * /foo/i will not.
 644  *
 645  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 646  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 647  * turned off because of the alternation (BRANCH). */
 648 #define SCF_DO_SUBSTR           0x0400
 649
 650 #define SCF_DO_STCLASS_AND      0x0800
 651 #define SCF_DO_STCLASS_OR       0x1000
 652 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 653 #define SCF_WHILEM_VISITED_POS  0x2000
 654
 655 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 656 #define SCF_SEEN_ACCEPT         0x8000
 657 #define SCF_TRIE_DOING_RESTUDY 0x10000
 658 #define SCF_IN_DEFINE          0x20000
 659
 660
 661
 662
 663 #define UTF cBOOL(RExC_utf8)
 664
 665 /* The enums for all these are ordered so things work out correctly */
 666 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 667 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 668                                                      == REGEX_DEPENDS_CHARSET)
 669 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 670 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 671                                                      >= REGEX_UNICODE_CHARSET)
 672 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 673                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 674 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 675                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 676 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 677                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 678
 679 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 680
 681 /* For programs that want to be strictly Unicode compatible by dying if any
 682  * attempt is made to match a non-Unicode code point against a Unicode
 683  * property.  */
 684 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 685
 686 #define OOB_NAMEDCLASS          -1
 687
 688 /* There is no code point that is out-of-bounds, so this is problematic.  But
 689  * its only current use is to initialize a variable that is always set before
 690  * looked at. */
 691 #define OOB_UNICODE             0xDEADBEEF
 692
 693 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 694
 695
 696 /* length of regex to show in messages that don't mark a position within */
 697 #define RegexLengthToShowInErrorMessages 127
 698
 699 /*
 700  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 701  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 702  * op/pragma/warn/regcomp.
 703  */
 704 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 705 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 706
 707 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 708                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 709
 710 /* The code in this file in places uses one level of recursion with parsing
 711  * rebased to an alternate string constructed by us in memory.  This can take
 712  * the form of something that is completely different from the input, or
 713  * something that uses the input as part of the alternate.  In the first case,
 714  * there should be no possibility of an error, as we are in complete control of
 715  * the alternate string.  But in the second case we don't completely control
 716  * the input portion, so there may be errors in that.  Here's an example:
 717  *      /[abc\x{DF}def]/ui
 718  * is handled specially because \x{df} folds to a sequence of more than one
 719  * character: 'ss'.  What is done is to create and parse an alternate string,
 720  * which looks like this:
 721  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 722  * where it uses the input unchanged in the middle of something it constructs,
 723  * which is a branch for the DF outside the character class, and clustering
 724  * parens around the whole thing. (It knows enough to skip the DF inside the
 725  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 726  * need to be reported.  The general situation looks like this:
 727  *
 728  *                                       |<------- identical ------>|
 729  *              sI                       tI               xI       eI
 730  * Input:       ---------------------------------------------------------------
 731  * Constructed:         ---------------------------------------------------
 732  *                      sC               tC               xC       eC     EC
 733  *                                       |<------- identical ------>|
 734  *
 735  * sI..eI   is the portion of the input pattern we are concerned with here.
 736  * sC..EC   is the constructed substitute parse string.
 737  *  sC..tC  is constructed by us
 738  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 739  *          In the diagram, these are vertically aligned.
 740  *  eC..EC  is also constructed by us.
 741  * xC       is the position in the substitute parse string where we found a
 742  *          problem.
 743  * xI       is the position in the original pattern corresponding to xC.
 744  *
 745  * We want to display a message showing the real input string.  Thus we need to
 746  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 747  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 748  * get:
 749  *      xI = tI + (xC - tC)
 750  *
 751  * When the substitute parse is constructed, the code needs to set:
 752  *      RExC_start (sC)
 753  *      RExC_end (eC)
 754  *      RExC_copy_start_in_input  (tI)
 755  *      RExC_copy_start_in_constructed (tC)
 756  * and restore them when done.
 757  *
 758  * During normal processing of the input pattern, both
 759  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 760  * sI, so that xC equals xI.
 761  */
 762
 763 #define sI              RExC_precomp
 764 #define eI              RExC_precomp_end
 765 #define sC              RExC_start
 766 #define eC              RExC_end
 767 #define tI              RExC_copy_start_in_input
 768 #define tC              RExC_copy_start_in_constructed
 769 #define xI(xC)          (tI + (xC - tC))
 770 #define xI_offset(xC)   (xI(xC) - sI)
 771
 772 #define REPORT_LOCATION_ARGS(xC)                                            \
 773     UTF8fARG(UTF,                                                           \
 774              (xI(xC) > eI) /* Don't run off end */                          \
 775               ? eI - sI   /* Length before the <--HERE */                   \
 776               : ((xI_offset(xC) >= 0)                                       \
 777                  ? xI_offset(xC)                                            \
 778                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 779                                     IVdf " trying to output message for "   \
 780                                     " pattern %.*s",                        \
 781                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 782                                     ((int) (eC - sC)), sC), 0)),            \
 783              sI),         /* The input pattern printed up to the <--HERE */ \
 784     UTF8fARG(UTF,                                                           \
 785              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 786              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 787
 788 /* Used to point after bad bytes for an error message, but avoid skipping
 789  * past a nul byte. */
 790 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 791
 792 /* Set up to clean up after our imminent demise */
 793 #define PREPARE_TO_DIE                                                      \
 794     STMT_START {                                                            \
 795         if (RExC_rx_sv)                                                     \
 796             SAVEFREESV(RExC_rx_sv);                                         \
 797         if (RExC_open_parens)                                               \
 798             SAVEFREEPV(RExC_open_parens);                                   \
 799         if (RExC_close_parens)                                              \
 800             SAVEFREEPV(RExC_close_parens);                                  \
 801     } STMT_END
 802
 803 /*
 804  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 805  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 806  * "...".
 807  */
 808 #define _FAIL(code) STMT_START {                                        \
 809     const char *ellipses = "";                                          \
 810     IV len = RExC_precomp_end - RExC_precomp;                           \
 811                                                                         \
 812     PREPARE_TO_DIE;                                                     \
 813     if (len > RegexLengthToShowInErrorMessages) {                       \
 814         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 815         len = RegexLengthToShowInErrorMessages - 10;                    \
 816         ellipses = "...";                                               \
 817     }                                                                   \
 818     code;                                                               \
 819 } STMT_END
 820
 821 #define FAIL(msg) _FAIL(                            \
 822     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 823             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 824
 825 #define FAIL2(msg,arg) _FAIL(                       \
 826     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 827             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 828
 829 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 830     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 831      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 832
 833 /*
 834  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 835  */
 836 #define Simple_vFAIL(m) STMT_START {                                    \
 837     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 838             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 839 } STMT_END
 840
 841 /*
 842  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 843  */
 844 #define vFAIL(m) STMT_START {                           \
 845     PREPARE_TO_DIE;                                     \
 846     Simple_vFAIL(m);                                    \
 847 } STMT_END
 848
 849 /*
 850  * Like Simple_vFAIL(), but accepts two arguments.
 851  */
 852 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 853     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,                \
 854                       REPORT_LOCATION_ARGS(RExC_parse));        \
 855 } STMT_END
 856
 857 /*
 858  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 859  */
 860 #define vFAIL2(m,a1) STMT_START {                       \
 861     PREPARE_TO_DIE;                                     \
 862     Simple_vFAIL2(m, a1);                               \
 863 } STMT_END
 864
 865
 866 /*
 867  * Like Simple_vFAIL(), but accepts three arguments.
 868  */
 869 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 870     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,            \
 871             REPORT_LOCATION_ARGS(RExC_parse));                  \
 872 } STMT_END
 873
 874 /*
 875  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 876  */
 877 #define vFAIL3(m,a1,a2) STMT_START {                    \
 878     PREPARE_TO_DIE;                                     \
 879     Simple_vFAIL3(m, a1, a2);                           \
 880 } STMT_END
 881
 882 /*
 883  * Like Simple_vFAIL(), but accepts four arguments.
 884  */
 885 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 886     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3,        \
 887             REPORT_LOCATION_ARGS(RExC_parse));                  \
 888 } STMT_END
 889
 890 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 891     PREPARE_TO_DIE;                                     \
 892     Simple_vFAIL4(m, a1, a2, a3);                       \
 893 } STMT_END
 894
 895 /* A specialized version of vFAIL2 that works with UTF8f */
 896 #define vFAIL2utf8f(m, a1) STMT_START {             \
 897     PREPARE_TO_DIE;                                 \
 898     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1,  \
 899             REPORT_LOCATION_ARGS(RExC_parse));      \
 900 } STMT_END
 901
 902 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 903     PREPARE_TO_DIE;                                     \
 904     S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2,  \
 905             REPORT_LOCATION_ARGS(RExC_parse));          \
 906 } STMT_END
 907
 908 /* Setting this to NULL is a signal to not output warnings */
 909 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 910     STMT_START {                                                            \
 911       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 912       RExC_copy_start_in_constructed = NULL;                                \
 913     } STMT_END
 914 #define RESTORE_WARNINGS                                                    \
 915     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 916
 917 /* Since a warning can be generated multiple times as the input is reparsed, we
 918  * output it the first time we come to that point in the parse, but suppress it
 919  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 920  * generate any warnings */
 921 #define TO_OUTPUT_WARNINGS(loc)                                         \
 922   (   RExC_copy_start_in_constructed                                    \
 923    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 924
 925 /* After we've emitted a warning, we save the position in the input so we don't
 926  * output it again */
 927 #define UPDATE_WARNINGS_LOC(loc)                                        \
 928     STMT_START {                                                        \
 929         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 930             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
 931                                                        - RExC_precomp;  \
 932         }                                                               \
 933     } STMT_END
 934
 935 /* 'warns' is the output of the packWARNx macro used in 'code' */
 936 #define _WARN_HELPER(loc, warns, code)                                  \
 937     STMT_START {                                                        \
 938         if (! RExC_copy_start_in_constructed) {                         \
 939             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 940                               " expected at '%s'",                      \
 941                               __FILE__, __LINE__, loc);                 \
 942         }                                                               \
 943         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 944             if (ckDEAD(warns))                                          \
 945                 PREPARE_TO_DIE;                                         \
 946             code;                                                       \
 947             UPDATE_WARNINGS_LOC(loc);                                   \
 948         }                                                               \
 949     } STMT_END
 950
 951 /* m is not necessarily a "literal string", in this macro */
 952 #define warn_non_literal_string(loc, packed_warn, m)                    \
 953     _WARN_HELPER(loc, packed_warn,                                      \
 954                       Perl_warner(aTHX_ packed_warn,                    \
 955                                        "%s" REPORT_LOCATION,            \
 956                                   m, REPORT_LOCATION_ARGS(loc)))
 957 #define reg_warn_non_literal_string(loc, m)                             \
 958                 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
 959
 960 #define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
 961     STMT_START {                                                            \
 962                 char * format;                                              \
 963                 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
 964                 Newx(format, format_size, char);                            \
 965                 my_strlcpy(format, m, format_size);                         \
 966                 my_strlcat(format, REPORT_LOCATION, format_size);           \
 967                 SAVEFREEPV(format);                                         \
 968                 _WARN_HELPER(loc, packwarn,                                 \
 969                       Perl_ck_warner(aTHX_ packwarn,                        \
 970                                         format,                             \
 971                                         a1, REPORT_LOCATION_ARGS(loc)));    \
 972     } STMT_END
 973
 974 #define ckWARNreg(loc,m)                                                \
 975     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 976                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 977                                           m REPORT_LOCATION,            \
 978                                           REPORT_LOCATION_ARGS(loc)))
 979
 980 #define vWARN(loc, m)                                                   \
 981     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 982                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 983                                        m REPORT_LOCATION,               \
 984                                        REPORT_LOCATION_ARGS(loc)))      \
 985
 986 #define vWARN_dep(loc, m)                                               \
 987     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 988                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 989                                        m REPORT_LOCATION,               \
 990                                        REPORT_LOCATION_ARGS(loc)))
 991
 992 #define ckWARNdep(loc,m)                                                \
 993     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 994                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 995                                             m REPORT_LOCATION,          \
 996                                             REPORT_LOCATION_ARGS(loc)))
 997
 998 #define ckWARNregdep(loc,m)                                                 \
 999     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
1000                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
1001                                                       WARN_REGEXP),         \
1002                                              m REPORT_LOCATION,             \
1003                                              REPORT_LOCATION_ARGS(loc)))
1004
1005 #define ckWARN2reg_d(loc,m, a1)                                             \
1006     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1007                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
1008                                             m REPORT_LOCATION,              \
1009                                             a1, REPORT_LOCATION_ARGS(loc)))
1010
1011 #define ckWARN2reg(loc, m, a1)                                              \
1012     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1013                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1014                                           m REPORT_LOCATION,                \
1015                                           a1, REPORT_LOCATION_ARGS(loc)))
1016
1017 #define vWARN3(loc, m, a1, a2)                                              \
1018     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1019                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
1020                                        m REPORT_LOCATION,                   \
1021                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
1022
1023 #define ckWARN3reg(loc, m, a1, a2)                                          \
1024     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
1025                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
1026                                           m REPORT_LOCATION,                \
1027                                           a1, a2,                           \
1028                                           REPORT_LOCATION_ARGS(loc)))
1029
1030 #define vWARN4(loc, m, a1, a2, a3)                                      \
1031     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1032                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1033                                        m REPORT_LOCATION,               \
1034                                        a1, a2, a3,                      \
1035                                        REPORT_LOCATION_ARGS(loc)))
1036
1037 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
1038     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1039                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
1040                                           m REPORT_LOCATION,            \
1041                                           a1, a2, a3,                   \
1042                                           REPORT_LOCATION_ARGS(loc)))
1043
1044 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
1045     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
1046                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
1047                                        m REPORT_LOCATION,               \
1048                                        a1, a2, a3, a4,                  \
1049                                        REPORT_LOCATION_ARGS(loc)))
1050
1051 #define ckWARNexperimental(loc, class, m)                               \
1052     STMT_START {                                                        \
1053         if (! RExC_warned_ ## class) { /* warn once per compilation */  \
1054             RExC_warned_ ## class = 1;                                  \
1055             _WARN_HELPER(loc, packWARN(class),                          \
1056                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
1057                                             m REPORT_LOCATION,          \
1058                                             REPORT_LOCATION_ARGS(loc)));\
1059         }                                                               \
1060     } STMT_END
1061
1062 /* Convert between a pointer to a node and its offset from the beginning of the
1063  * program */
1064 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
1065 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1066
1067 /* Macros for recording node offsets.   20001227 mjd@plover.com
1068  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
1069  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
1070  * Element 0 holds the number n.
1071  * Position is 1 indexed.
1072  */
1073 #ifndef RE_TRACK_PATTERN_OFFSETS
1074 #define Set_Node_Offset_To_R(offset,byte)
1075 #define Set_Node_Offset(node,byte)
1076 #define Set_Cur_Node_Offset
1077 #define Set_Node_Length_To_R(node,len)
1078 #define Set_Node_Length(node,len)
1079 #define Set_Node_Cur_Length(node,start)
1080 #define Node_Offset(n)
1081 #define Node_Length(n)
1082 #define Set_Node_Offset_Length(node,offset,len)
1083 #define ProgLen(ri) ri->u.proglen
1084 #define SetProgLen(ri,x) ri->u.proglen = x
1085 #define Track_Code(code)
1086 #else
1087 #define ProgLen(ri) ri->u.offsets[0]
1088 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1089 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
1090         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
1091                     __LINE__, (int)(offset), (int)(byte)));             \
1092         if((offset) < 0) {                                              \
1093             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1094                                          (int)(offset));                \
1095         } else {                                                        \
1096             RExC_offsets[2*(offset)-1] = (byte);                        \
1097         }                                                               \
1098 } STMT_END
1099
1100 #define Set_Node_Offset(node,byte)                                      \
1101     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1102 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1103
1104 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1105         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1106                 __LINE__, (int)(node), (int)(len)));                    \
1107         if((node) < 0) {                                                \
1108             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1109                                          (int)(node));                  \
1110         } else {                                                        \
1111             RExC_offsets[2*(node)] = (len);                             \
1112         }                                                               \
1113 } STMT_END
1114
1115 #define Set_Node_Length(node,len) \
1116     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1117 #define Set_Node_Cur_Length(node, start)                \
1118     Set_Node_Length(node, RExC_parse - start)
1119
1120 /* Get offsets and lengths */
1121 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1122 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1123
1124 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1125     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1126     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1127 } STMT_END
1128
1129 #define Track_Code(code) STMT_START { code } STMT_END
1130 #endif
1131
1132 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1133 #define EXPERIMENTAL_INPLACESCAN
1134 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1135
1136 #ifdef DEBUGGING
1137 int
1138 Perl_re_printf(pTHX_ const char *fmt, ...)
1139 {
1140     va_list ap;
1141     int result;
1142     PerlIO *f= Perl_debug_log;
1143     PERL_ARGS_ASSERT_RE_PRINTF;
1144     va_start(ap, fmt);
1145     result = PerlIO_vprintf(f, fmt, ap);
1146     va_end(ap);
1147     return result;
1148 }
1149
1150 int
1151 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1152 {
1153     va_list ap;
1154     int result;
1155     PerlIO *f= Perl_debug_log;
1156     PERL_ARGS_ASSERT_RE_INDENTF;
1157     va_start(ap, depth);
1158     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1159     result = PerlIO_vprintf(f, fmt, ap);
1160     va_end(ap);
1161     return result;
1162 }
1163 #endif /* DEBUGGING */
1164
1165 #define DEBUG_RExC_seen()                                                   \
1166         DEBUG_OPTIMISE_MORE_r({                                             \
1167             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1168                                                                             \
1169             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1170                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1171                                                                             \
1172             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1173                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1174                                                                             \
1175             if (RExC_seen & REG_GPOS_SEEN)                                  \
1176                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1177                                                                             \
1178             if (RExC_seen & REG_RECURSE_SEEN)                               \
1179                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1180                                                                             \
1181             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1182                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1183                                                                             \
1184             if (RExC_seen & REG_VERBARG_SEEN)                               \
1185                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1186                                                                             \
1187             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1188                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1189                                                                             \
1190             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1191                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1192                                                                             \
1193             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1194                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1195                                                                             \
1196             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1197                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1198                                                                             \
1199             Perl_re_printf( aTHX_ "\n");                                    \
1200         });
1201
1202 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1203   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1204
1205
1206 #ifdef DEBUGGING
1207 static void
1208 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1209                                     const char *close_str)
1210 {
1211     if (!flags)
1212         return;
1213
1214     Perl_re_printf( aTHX_  "%s", open_str);
1215     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1216     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1217     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1218     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1219     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1220     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1221     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1222     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1223     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1224     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1225     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1226     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1227     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1228     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1229     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1230     Perl_re_printf( aTHX_  "%s", close_str);
1231 }
1232
1233
1234 static void
1235 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1236                     U32 depth, int is_inf)
1237 {
1238     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1239
1240     DEBUG_OPTIMISE_MORE_r({
1241         if (!data)
1242             return;
1243         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1244             depth,
1245             where,
1246             (IV)data->pos_min,
1247             (IV)data->pos_delta,
1248             (UV)data->flags
1249         );
1250
1251         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1252
1253         Perl_re_printf( aTHX_
1254             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1255             (IV)data->whilem_c,
1256             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1257             is_inf ? "INF " : ""
1258         );
1259
1260         if (data->last_found) {
1261             int i;
1262             Perl_re_printf(aTHX_
1263                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1264                     SvPVX_const(data->last_found),
1265                     (IV)data->last_end,
1266                     (IV)data->last_start_min,
1267                     (IV)data->last_start_max
1268             );
1269
1270             for (i = 0; i < 2; i++) {
1271                 Perl_re_printf(aTHX_
1272                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1273                     data->cur_is_floating == i ? "*" : "",
1274                     i ? "Float" : "Fixed",
1275                     SvPVX_const(data->substrs[i].str),
1276                     (IV)data->substrs[i].min_offset,
1277                     (IV)data->substrs[i].max_offset
1278                 );
1279                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1280             }
1281         }
1282
1283         Perl_re_printf( aTHX_ "\n");
1284     });
1285 }
1286
1287
1288 static void
1289 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1290                 regnode *scan, U32 depth, U32 flags)
1291 {
1292     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1293
1294     DEBUG_OPTIMISE_r({
1295         regnode *Next;
1296
1297         if (!scan)
1298             return;
1299         Next = regnext(scan);
1300         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1301         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1302             depth,
1303             str,
1304             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1305             Next ? (REG_NODE_NUM(Next)) : 0 );
1306         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1307         Perl_re_printf( aTHX_  "\n");
1308    });
1309 }
1310
1311
1312 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1313                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1314
1315 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1316                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1317
1318 #else
1319 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1320 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1321 #endif
1322
1323
1324 /* =========================================================
1325  * BEGIN edit_distance stuff.
1326  *
1327  * This calculates how many single character changes of any type are needed to
1328  * transform a string into another one.  It is taken from version 3.1 of
1329  *
1330  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1331  */
1332
1333 /* Our unsorted dictionary linked list.   */
1334 /* Note we use UVs, not chars. */
1335
1336 struct dictionary{
1337   UV key;
1338   UV value;
1339   struct dictionary* next;
1340 };
1341 typedef struct dictionary item;
1342
1343
1344 PERL_STATIC_INLINE item*
1345 push(UV key, item* curr)
1346 {
1347     item* head;
1348     Newx(head, 1, item);
1349     head->key = key;
1350     head->value = 0;
1351     head->next = curr;
1352     return head;
1353 }
1354
1355
1356 PERL_STATIC_INLINE item*
1357 find(item* head, UV key)
1358 {
1359     item* iterator = head;
1360     while (iterator){
1361         if (iterator->key == key){
1362             return iterator;
1363         }
1364         iterator = iterator->next;
1365     }
1366
1367     return NULL;
1368 }
1369
1370 PERL_STATIC_INLINE item*
1371 uniquePush(item* head, UV key)
1372 {
1373     item* iterator = head;
1374
1375     while (iterator){
1376         if (iterator->key == key) {
1377             return head;
1378         }
1379         iterator = iterator->next;
1380     }
1381
1382     return push(key, head);
1383 }
1384
1385 PERL_STATIC_INLINE void
1386 dict_free(item* head)
1387 {
1388     item* iterator = head;
1389
1390     while (iterator) {
1391         item* temp = iterator;
1392         iterator = iterator->next;
1393         Safefree(temp);
1394     }
1395
1396     head = NULL;
1397 }
1398
1399 /* End of Dictionary Stuff */
1400
1401 /* All calculations/work are done here */
1402 STATIC int
1403 S_edit_distance(const UV* src,
1404                 const UV* tgt,
1405                 const STRLEN x,             /* length of src[] */
1406                 const STRLEN y,             /* length of tgt[] */
1407                 const SSize_t maxDistance
1408 )
1409 {
1410     item *head = NULL;
1411     UV swapCount, swapScore, targetCharCount, i, j;
1412     UV *scores;
1413     UV score_ceil = x + y;
1414
1415     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1416
1417     /* intialize matrix start values */
1418     Newx(scores, ( (x + 2) * (y + 2)), UV);
1419     scores[0] = score_ceil;
1420     scores[1 * (y + 2) + 0] = score_ceil;
1421     scores[0 * (y + 2) + 1] = score_ceil;
1422     scores[1 * (y + 2) + 1] = 0;
1423     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1424
1425     /* work loops    */
1426     /* i = src index */
1427     /* j = tgt index */
1428     for (i=1;i<=x;i++) {
1429         if (i < x)
1430             head = uniquePush(head, src[i]);
1431         scores[(i+1) * (y + 2) + 1] = i;
1432         scores[(i+1) * (y + 2) + 0] = score_ceil;
1433         swapCount = 0;
1434
1435         for (j=1;j<=y;j++) {
1436             if (i == 1) {
1437                 if(j < y)
1438                 head = uniquePush(head, tgt[j]);
1439                 scores[1 * (y + 2) + (j + 1)] = j;
1440                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1441             }
1442
1443             targetCharCount = find(head, tgt[j-1])->value;
1444             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1445
1446             if (src[i-1] != tgt[j-1]){
1447                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1448             }
1449             else {
1450                 swapCount = j;
1451                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1452             }
1453         }
1454
1455         find(head, src[i-1])->value = i;
1456     }
1457
1458     {
1459         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1460         dict_free(head);
1461         Safefree(scores);
1462         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1463     }
1464 }
1465
1466 /* END of edit_distance() stuff
1467  * ========================================================= */
1468
1469 /* Mark that we cannot extend a found fixed substring at this point.
1470    Update the longest found anchored substring or the longest found
1471    floating substrings if needed. */
1472
1473 STATIC void
1474 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1475                     SSize_t *minlenp, int is_inf)
1476 {
1477     const STRLEN l = CHR_SVLEN(data->last_found);
1478     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1479     const STRLEN old_l = CHR_SVLEN(longest_sv);
1480     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1481
1482     PERL_ARGS_ASSERT_SCAN_COMMIT;
1483
1484     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1485         const U8 i = data->cur_is_floating;
1486         SvSetMagicSV(longest_sv, data->last_found);
1487         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1488
1489         if (!i) /* fixed */
1490             data->substrs[0].max_offset = data->substrs[0].min_offset;
1491         else { /* float */
1492             data->substrs[1].max_offset =
1493                       (is_inf)
1494                        ? OPTIMIZE_INFTY
1495                        : (l
1496                           ? data->last_start_max
1497                           /* temporary underflow guard for 5.32 */
1498                           : data->pos_delta < 0 ? OPTIMIZE_INFTY
1499                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1500                                          ? OPTIMIZE_INFTY
1501                                          : data->pos_min + data->pos_delta));
1502         }
1503
1504         data->substrs[i].flags &= ~SF_BEFORE_EOL;
1505         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1506         data->substrs[i].minlenp = minlenp;
1507         data->substrs[i].lookbehind = 0;
1508     }
1509
1510     SvCUR_set(data->last_found, 0);
1511     {
1512         SV * const sv = data->last_found;
1513         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1514             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1515             if (mg)
1516                 mg->mg_len = 0;
1517         }
1518     }
1519     data->last_end = -1;
1520     data->flags &= ~SF_BEFORE_EOL;
1521     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1522 }
1523
1524 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1525  * list that describes which code points it matches */
1526
1527 STATIC void
1528 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1529 {
1530     /* Set the SSC 'ssc' to match an empty string or any code point */
1531
1532     PERL_ARGS_ASSERT_SSC_ANYTHING;
1533
1534     assert(is_ANYOF_SYNTHETIC(ssc));
1535
1536     /* mortalize so won't leak */
1537     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1538     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1539 }
1540
1541 STATIC int
1542 S_ssc_is_anything(const regnode_ssc *ssc)
1543 {
1544     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1545      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1546      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1547      * in any way, so there's no point in using it */
1548
1549     UV start, end;
1550     bool ret;
1551
1552     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1553
1554     assert(is_ANYOF_SYNTHETIC(ssc));
1555
1556     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1557         return FALSE;
1558     }
1559
1560     /* See if the list consists solely of the range 0 - Infinity */
1561     invlist_iterinit(ssc->invlist);
1562     ret = invlist_iternext(ssc->invlist, &start, &end)
1563           && start == 0
1564           && end == UV_MAX;
1565
1566     invlist_iterfinish(ssc->invlist);
1567
1568     if (ret) {
1569         return TRUE;
1570     }
1571
1572     /* If e.g., both \w and \W are set, matches everything */
1573     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1574         int i;
1575         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1576             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1577                 return TRUE;
1578             }
1579         }
1580     }
1581
1582     return FALSE;
1583 }
1584
1585 STATIC void
1586 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1587 {
1588     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1589      * string, any code point, or any posix class under locale */
1590
1591     PERL_ARGS_ASSERT_SSC_INIT;
1592
1593     Zero(ssc, 1, regnode_ssc);
1594     set_ANYOF_SYNTHETIC(ssc);
1595     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1596     ssc_anything(ssc);
1597
1598     /* If any portion of the regex is to operate under locale rules that aren't
1599      * fully known at compile time, initialization includes it.  The reason
1600      * this isn't done for all regexes is that the optimizer was written under
1601      * the assumption that locale was all-or-nothing.  Given the complexity and
1602      * lack of documentation in the optimizer, and that there are inadequate
1603      * test cases for locale, many parts of it may not work properly, it is
1604      * safest to avoid locale unless necessary. */
1605     if (RExC_contains_locale) {
1606         ANYOF_POSIXL_SETALL(ssc);
1607     }
1608     else {
1609         ANYOF_POSIXL_ZERO(ssc);
1610     }
1611 }
1612
1613 STATIC int
1614 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1615                         const regnode_ssc *ssc)
1616 {
1617     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1618      * to the list of code points matched, and locale posix classes; hence does
1619      * not check its flags) */
1620
1621     UV start, end;
1622     bool ret;
1623
1624     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1625
1626     assert(is_ANYOF_SYNTHETIC(ssc));
1627
1628     invlist_iterinit(ssc->invlist);
1629     ret = invlist_iternext(ssc->invlist, &start, &end)
1630           && start == 0
1631           && end == UV_MAX;
1632
1633     invlist_iterfinish(ssc->invlist);
1634
1635     if (! ret) {
1636         return FALSE;
1637     }
1638
1639     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1640         return FALSE;
1641     }
1642
1643     return TRUE;
1644 }
1645
1646 #define INVLIST_INDEX 0
1647 #define ONLY_LOCALE_MATCHES_INDEX 1
1648 #define DEFERRED_USER_DEFINED_INDEX 2
1649
1650 STATIC SV*
1651 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1652                                const regnode_charclass* const node)
1653 {
1654     /* Returns a mortal inversion list defining which code points are matched
1655      * by 'node', which is of type ANYOF.  Handles complementing the result if
1656      * appropriate.  If some code points aren't knowable at this time, the
1657      * returned list must, and will, contain every code point that is a
1658      * possibility. */
1659
1660     SV* invlist = NULL;
1661     SV* only_utf8_locale_invlist = NULL;
1662     unsigned int i;
1663     const U32 n = ARG(node);
1664     bool new_node_has_latin1 = FALSE;
1665     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1666                       ? 0
1667                       : ANYOF_FLAGS(node);
1668
1669     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1670
1671     /* Look at the data structure created by S_set_ANYOF_arg() */
1672     if (n != ANYOF_ONLY_HAS_BITMAP) {
1673         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1674         AV * const av = MUTABLE_AV(SvRV(rv));
1675         SV **const ary = AvARRAY(av);
1676         assert(RExC_rxi->data->what[n] == 's');
1677
1678         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1679
1680             /* Here there are things that won't be known until runtime -- we
1681              * have to assume it could be anything */
1682             invlist = sv_2mortal(_new_invlist(1));
1683             return _add_range_to_invlist(invlist, 0, UV_MAX);
1684         }
1685         else if (ary[INVLIST_INDEX]) {
1686
1687             /* Use the node's inversion list */
1688             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1689         }
1690
1691         /* Get the code points valid only under UTF-8 locales */
1692         if (   (flags & ANYOFL_FOLD)
1693             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1694         {
1695             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1696         }
1697     }
1698
1699     if (! invlist) {
1700         invlist = sv_2mortal(_new_invlist(0));
1701     }
1702
1703     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1704      * code points, and an inversion list for the others, but if there are code
1705      * points that should match only conditionally on the target string being
1706      * UTF-8, those are placed in the inversion list, and not the bitmap.
1707      * Since there are circumstances under which they could match, they are
1708      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1709      * to exclude them here, so that when we invert below, the end result
1710      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1711      * have to do this here before we add the unconditionally matched code
1712      * points */
1713     if (flags & ANYOF_INVERT) {
1714         _invlist_intersection_complement_2nd(invlist,
1715                                              PL_UpperLatin1,
1716                                              &invlist);
1717     }
1718
1719     /* Add in the points from the bit map */
1720     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1721         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1722             if (ANYOF_BITMAP_TEST(node, i)) {
1723                 unsigned int start = i++;
1724
1725                 for (;    i < NUM_ANYOF_CODE_POINTS
1726                        && ANYOF_BITMAP_TEST(node, i); ++i)
1727                 {
1728                     /* empty */
1729                 }
1730                 invlist = _add_range_to_invlist(invlist, start, i-1);
1731                 new_node_has_latin1 = TRUE;
1732             }
1733         }
1734     }
1735
1736     /* If this can match all upper Latin1 code points, have to add them
1737      * as well.  But don't add them if inverting, as when that gets done below,
1738      * it would exclude all these characters, including the ones it shouldn't
1739      * that were added just above */
1740     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1741         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1742     {
1743         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1744     }
1745
1746     /* Similarly for these */
1747     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1748         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1749     }
1750
1751     if (flags & ANYOF_INVERT) {
1752         _invlist_invert(invlist);
1753     }
1754     else if (flags & ANYOFL_FOLD) {
1755         if (new_node_has_latin1) {
1756
1757             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1758              * the locale.  We can skip this if there are no 0-255 at all. */
1759             _invlist_union(invlist, PL_Latin1, &invlist);
1760
1761             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1762             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1763         }
1764         else {
1765             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1766                 invlist = add_cp_to_invlist(invlist, 'I');
1767             }
1768             if (_invlist_contains_cp(invlist,
1769                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1770             {
1771                 invlist = add_cp_to_invlist(invlist, 'i');
1772             }
1773         }
1774     }
1775
1776     /* Similarly add the UTF-8 locale possible matches.  These have to be
1777      * deferred until after the non-UTF-8 locale ones are taken care of just
1778      * above, or it leads to wrong results under ANYOF_INVERT */
1779     if (only_utf8_locale_invlist) {
1780         _invlist_union_maybe_complement_2nd(invlist,
1781                                             only_utf8_locale_invlist,
1782                                             flags & ANYOF_INVERT,
1783                                             &invlist);
1784     }
1785
1786     return invlist;
1787 }
1788
1789 /* These two functions currently do the exact same thing */
1790 #define ssc_init_zero           ssc_init
1791
1792 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1793 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1794
1795 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1796  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1797  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1798
1799 STATIC void
1800 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1801                 const regnode_charclass *and_with)
1802 {
1803     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1804      * another SSC or a regular ANYOF class.  Can create false positives. */
1805
1806     SV* anded_cp_list;
1807     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1808                           ? 0
1809                           : ANYOF_FLAGS(and_with);
1810     U8  anded_flags;
1811
1812     PERL_ARGS_ASSERT_SSC_AND;
1813
1814     assert(is_ANYOF_SYNTHETIC(ssc));
1815
1816     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1817      * the code point inversion list and just the relevant flags */
1818     if (is_ANYOF_SYNTHETIC(and_with)) {
1819         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1820         anded_flags = and_with_flags;
1821
1822         /* XXX This is a kludge around what appears to be deficiencies in the
1823          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1824          * there are paths through the optimizer where it doesn't get weeded
1825          * out when it should.  And if we don't make some extra provision for
1826          * it like the code just below, it doesn't get added when it should.
1827          * This solution is to add it only when AND'ing, which is here, and
1828          * only when what is being AND'ed is the pristine, original node
1829          * matching anything.  Thus it is like adding it to ssc_anything() but
1830          * only when the result is to be AND'ed.  Probably the same solution
1831          * could be adopted for the same problem we have with /l matching,
1832          * which is solved differently in S_ssc_init(), and that would lead to
1833          * fewer false positives than that solution has.  But if this solution
1834          * creates bugs, the consequences are only that a warning isn't raised
1835          * that should be; while the consequences for having /l bugs is
1836          * incorrect matches */
1837         if (ssc_is_anything((regnode_ssc *)and_with)) {
1838             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1839         }
1840     }
1841     else {
1842         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1843         if (OP(and_with) == ANYOFD) {
1844             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1845         }
1846         else {
1847             anded_flags = and_with_flags
1848             &( ANYOF_COMMON_FLAGS
1849               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1850               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1851             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1852                 anded_flags &=
1853                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1854             }
1855         }
1856     }
1857
1858     ANYOF_FLAGS(ssc) &= anded_flags;
1859
1860     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1861      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1862      * 'and_with' may be inverted.  When not inverted, we have the situation of
1863      * computing:
1864      *  (C1 | P1) & (C2 | P2)
1865      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1866      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1867      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1868      *                    <=  ((C1 & C2) | P1 | P2)
1869      * Alternatively, the last few steps could be:
1870      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1871      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1872      *                    <=  (C1 | C2 | (P1 & P2))
1873      * We favor the second approach if either P1 or P2 is non-empty.  This is
1874      * because these components are a barrier to doing optimizations, as what
1875      * they match cannot be known until the moment of matching as they are
1876      * dependent on the current locale, 'AND"ing them likely will reduce or
1877      * eliminate them.
1878      * But we can do better if we know that C1,P1 are in their initial state (a
1879      * frequent occurrence), each matching everything:
1880      *  (<everything>) & (C2 | P2) =  C2 | P2
1881      * Similarly, if C2,P2 are in their initial state (again a frequent
1882      * occurrence), the result is a no-op
1883      *  (C1 | P1) & (<everything>) =  C1 | P1
1884      *
1885      * Inverted, we have
1886      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1887      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1888      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1889      * */
1890
1891     if ((and_with_flags & ANYOF_INVERT)
1892         && ! is_ANYOF_SYNTHETIC(and_with))
1893     {
1894         unsigned int i;
1895
1896         ssc_intersection(ssc,
1897                          anded_cp_list,
1898                          FALSE /* Has already been inverted */
1899                          );
1900
1901         /* If either P1 or P2 is empty, the intersection will be also; can skip
1902          * the loop */
1903         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1904             ANYOF_POSIXL_ZERO(ssc);
1905         }
1906         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1907
1908             /* Note that the Posix class component P from 'and_with' actually
1909              * looks like:
1910              *      P = Pa | Pb | ... | Pn
1911              * where each component is one posix class, such as in [\w\s].
1912              * Thus
1913              *      ~P = ~(Pa | Pb | ... | Pn)
1914              *         = ~Pa & ~Pb & ... & ~Pn
1915              *        <= ~Pa | ~Pb | ... | ~Pn
1916              * The last is something we can easily calculate, but unfortunately
1917              * is likely to have many false positives.  We could do better
1918              * in some (but certainly not all) instances if two classes in
1919              * P have known relationships.  For example
1920              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1921              * So
1922              *      :lower: & :print: = :lower:
1923              * And similarly for classes that must be disjoint.  For example,
1924              * since \s and \w can have no elements in common based on rules in
1925              * the POSIX standard,
1926              *      \w & ^\S = nothing
1927              * Unfortunately, some vendor locales do not meet the Posix
1928              * standard, in particular almost everything by Microsoft.
1929              * The loop below just changes e.g., \w into \W and vice versa */
1930
1931             regnode_charclass_posixl temp;
1932             int add = 1;    /* To calculate the index of the complement */
1933
1934             Zero(&temp, 1, regnode_charclass_posixl);
1935             ANYOF_POSIXL_ZERO(&temp);
1936             for (i = 0; i < ANYOF_MAX; i++) {
1937                 assert(i % 2 != 0
1938                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1939                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1940
1941                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1942                     ANYOF_POSIXL_SET(&temp, i + add);
1943                 }
1944                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1945             }
1946             ANYOF_POSIXL_AND(&temp, ssc);
1947
1948         } /* else ssc already has no posixes */
1949     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1950          in its initial state */
1951     else if (! is_ANYOF_SYNTHETIC(and_with)
1952              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1953     {
1954         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1955          * copy it over 'ssc' */
1956         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1957             if (is_ANYOF_SYNTHETIC(and_with)) {
1958                 StructCopy(and_with, ssc, regnode_ssc);
1959             }
1960             else {
1961                 ssc->invlist = anded_cp_list;
1962                 ANYOF_POSIXL_ZERO(ssc);
1963                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1964                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1965                 }
1966             }
1967         }
1968         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1969                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1970         {
1971             /* One or the other of P1, P2 is non-empty. */
1972             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1973                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1974             }
1975             ssc_union(ssc, anded_cp_list, FALSE);
1976         }
1977         else { /* P1 = P2 = empty */
1978             ssc_intersection(ssc, anded_cp_list, FALSE);
1979         }
1980     }
1981 }
1982
1983 STATIC void
1984 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1985                const regnode_charclass *or_with)
1986 {
1987     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1988      * another SSC or a regular ANYOF class.  Can create false positives if
1989      * 'or_with' is to be inverted. */
1990
1991     SV* ored_cp_list;
1992     U8 ored_flags;
1993     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1994                          ? 0
1995                          : ANYOF_FLAGS(or_with);
1996
1997     PERL_ARGS_ASSERT_SSC_OR;
1998
1999     assert(is_ANYOF_SYNTHETIC(ssc));
2000
2001     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2002      * the code point inversion list and just the relevant flags */
2003     if (is_ANYOF_SYNTHETIC(or_with)) {
2004         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2005         ored_flags = or_with_flags;
2006     }
2007     else {
2008         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2009         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2010         if (OP(or_with) != ANYOFD) {
2011             ored_flags
2012             |= or_with_flags
2013              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2014                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2015             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2016                 ored_flags |=
2017                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2018             }
2019         }
2020     }
2021
2022     ANYOF_FLAGS(ssc) |= ored_flags;
2023
2024     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2025      * C2 is the list of code points in 'or-with'; P2, its posix classes.
2026      * 'or_with' may be inverted.  When not inverted, we have the simple
2027      * situation of computing:
2028      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
2029      * If P1|P2 yields a situation with both a class and its complement are
2030      * set, like having both \w and \W, this matches all code points, and we
2031      * can delete these from the P component of the ssc going forward.  XXX We
2032      * might be able to delete all the P components, but I (khw) am not certain
2033      * about this, and it is better to be safe.
2034      *
2035      * Inverted, we have
2036      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
2037      *                         <=  (C1 | P1) | ~C2
2038      *                         <=  (C1 | ~C2) | P1
2039      * (which results in actually simpler code than the non-inverted case)
2040      * */
2041
2042     if ((or_with_flags & ANYOF_INVERT)
2043         && ! is_ANYOF_SYNTHETIC(or_with))
2044     {
2045         /* We ignore P2, leaving P1 going forward */
2046     }   /* else  Not inverted */
2047     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2048         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2049         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2050             unsigned int i;
2051             for (i = 0; i < ANYOF_MAX; i += 2) {
2052                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2053                 {
2054                     ssc_match_all_cp(ssc);
2055                     ANYOF_POSIXL_CLEAR(ssc, i);
2056                     ANYOF_POSIXL_CLEAR(ssc, i+1);
2057                 }
2058             }
2059         }
2060     }
2061
2062     ssc_union(ssc,
2063               ored_cp_list,
2064               FALSE /* Already has been inverted */
2065               );
2066 }
2067
2068 STATIC void
2069 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2070 {
2071     PERL_ARGS_ASSERT_SSC_UNION;
2072
2073     assert(is_ANYOF_SYNTHETIC(ssc));
2074
2075     _invlist_union_maybe_complement_2nd(ssc->invlist,
2076                                         invlist,
2077                                         invert2nd,
2078                                         &ssc->invlist);
2079 }
2080
2081 STATIC void
2082 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2083                          SV* const invlist,
2084                          const bool invert2nd)
2085 {
2086     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2087
2088     assert(is_ANYOF_SYNTHETIC(ssc));
2089
2090     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2091                                                invlist,
2092                                                invert2nd,
2093                                                &ssc->invlist);
2094 }
2095
2096 STATIC void
2097 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2098 {
2099     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2100
2101     assert(is_ANYOF_SYNTHETIC(ssc));
2102
2103     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2104 }
2105
2106 STATIC void
2107 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2108 {
2109     /* AND just the single code point 'cp' into the SSC 'ssc' */
2110
2111     SV* cp_list = _new_invlist(2);
2112
2113     PERL_ARGS_ASSERT_SSC_CP_AND;
2114
2115     assert(is_ANYOF_SYNTHETIC(ssc));
2116
2117     cp_list = add_cp_to_invlist(cp_list, cp);
2118     ssc_intersection(ssc, cp_list,
2119                      FALSE /* Not inverted */
2120                      );
2121     SvREFCNT_dec_NN(cp_list);
2122 }
2123
2124 STATIC void
2125 S_ssc_clear_locale(regnode_ssc *ssc)
2126 {
2127     /* Set the SSC 'ssc' to not match any locale things */
2128     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2129
2130     assert(is_ANYOF_SYNTHETIC(ssc));
2131
2132     ANYOF_POSIXL_ZERO(ssc);
2133     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2134 }
2135
2136 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2137
2138 STATIC bool
2139 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2140 {
2141     /* The synthetic start class is used to hopefully quickly winnow down
2142      * places where a pattern could start a match in the target string.  If it
2143      * doesn't really narrow things down that much, there isn't much point to
2144      * having the overhead of using it.  This function uses some very crude
2145      * heuristics to decide if to use the ssc or not.
2146      *
2147      * It returns TRUE if 'ssc' rules out more than half what it considers to
2148      * be the "likely" possible matches, but of course it doesn't know what the
2149      * actual things being matched are going to be; these are only guesses
2150      *
2151      * For /l matches, it assumes that the only likely matches are going to be
2152      *      in the 0-255 range, uniformly distributed, so half of that is 127
2153      * For /a and /d matches, it assumes that the likely matches will be just
2154      *      the ASCII range, so half of that is 63
2155      * For /u and there isn't anything matching above the Latin1 range, it
2156      *      assumes that that is the only range likely to be matched, and uses
2157      *      half that as the cut-off: 127.  If anything matches above Latin1,
2158      *      it assumes that all of Unicode could match (uniformly), except for
2159      *      non-Unicode code points and things in the General Category "Other"
2160      *      (unassigned, private use, surrogates, controls and formats).  This
2161      *      is a much large number. */
2162
2163     U32 count = 0;      /* Running total of number of code points matched by
2164                            'ssc' */
2165     UV start, end;      /* Start and end points of current range in inversion
2166                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2167     const U32 max_code_points = (LOC)
2168                                 ?  256
2169                                 : ((  ! UNI_SEMANTICS
2170                                     ||  invlist_highest(ssc->invlist) < 256)
2171                                   ? 128
2172                                   : NON_OTHER_COUNT);
2173     const U32 max_match = max_code_points / 2;
2174
2175     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2176
2177     invlist_iterinit(ssc->invlist);
2178     while (invlist_iternext(ssc->invlist, &start, &end)) {
2179         if (start >= max_code_points) {
2180             break;
2181         }
2182         end = MIN(end, max_code_points - 1);
2183         count += end - start + 1;
2184         if (count >= max_match) {
2185             invlist_iterfinish(ssc->invlist);
2186             return FALSE;
2187         }
2188     }
2189
2190     return TRUE;
2191 }
2192
2193
2194 STATIC void
2195 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2196 {
2197     /* The inversion list in the SSC is marked mortal; now we need a more
2198      * permanent copy, which is stored the same way that is done in a regular
2199      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2200      * map */
2201
2202     SV* invlist = invlist_clone(ssc->invlist, NULL);
2203
2204     PERL_ARGS_ASSERT_SSC_FINALIZE;
2205
2206     assert(is_ANYOF_SYNTHETIC(ssc));
2207
2208     /* The code in this file assumes that all but these flags aren't relevant
2209      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2210      * by the time we reach here */
2211     assert(! (ANYOF_FLAGS(ssc)
2212         & ~( ANYOF_COMMON_FLAGS
2213             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2214             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2215
2216     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2217
2218     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2219     SvREFCNT_dec(invlist);
2220
2221     /* Make sure is clone-safe */
2222     ssc->invlist = NULL;
2223
2224     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2225         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2226         OP(ssc) = ANYOFPOSIXL;
2227     }
2228     else if (RExC_contains_locale) {
2229         OP(ssc) = ANYOFL;
2230     }
2231
2232     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2233 }
2234
2235 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2236 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2237 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2238 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2239                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2240                                : 0 )
2241
2242
2243 #ifdef DEBUGGING
2244 /*
2245    dump_trie(trie,widecharmap,revcharmap)
2246    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2247    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2248
2249    These routines dump out a trie in a somewhat readable format.
2250    The _interim_ variants are used for debugging the interim
2251    tables that are used to generate the final compressed
2252    representation which is what dump_trie expects.
2253
2254    Part of the reason for their existence is to provide a form
2255    of documentation as to how the different representations function.
2256
2257 */
2258
2259 /*
2260   Dumps the final compressed table form of the trie to Perl_debug_log.
2261   Used for debugging make_trie().
2262 */
2263
2264 STATIC void
2265 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2266             AV *revcharmap, U32 depth)
2267 {
2268     U32 state;
2269     SV *sv=sv_newmortal();
2270     int colwidth= widecharmap ? 6 : 4;
2271     U16 word;
2272     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2273
2274     PERL_ARGS_ASSERT_DUMP_TRIE;
2275
2276     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2277         depth+1, "Match","Base","Ofs" );
2278
2279     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2280         SV ** const tmp = av_fetch( revcharmap, state, 0);
2281         if ( tmp ) {
2282             Perl_re_printf( aTHX_  "%*s",
2283                 colwidth,
2284                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2285                             PL_colors[0], PL_colors[1],
2286                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2287                             PERL_PV_ESCAPE_FIRSTCHAR
2288                 )
2289             );
2290         }
2291     }
2292     Perl_re_printf( aTHX_  "\n");
2293     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2294
2295     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2296         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2297     Perl_re_printf( aTHX_  "\n");
2298
2299     for( state = 1 ; state < trie->statecount ; state++ ) {
2300         const U32 base = trie->states[ state ].trans.base;
2301
2302         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2303
2304         if ( trie->states[ state ].wordnum ) {
2305             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2306         } else {
2307             Perl_re_printf( aTHX_  "%6s", "" );
2308         }
2309
2310         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2311
2312         if ( base ) {
2313             U32 ofs = 0;
2314
2315             while( ( base + ofs  < trie->uniquecharcount ) ||
2316                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2317                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2318                                                                     != state))
2319                     ofs++;
2320
2321             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2322
2323             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2324                 if ( ( base + ofs >= trie->uniquecharcount )
2325                         && ( base + ofs - trie->uniquecharcount
2326                                                         < trie->lasttrans )
2327                         && trie->trans[ base + ofs
2328                                     - trie->uniquecharcount ].check == state )
2329                 {
2330                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2331                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2332                    );
2333                 } else {
2334                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2335                 }
2336             }
2337
2338             Perl_re_printf( aTHX_  "]");
2339
2340         }
2341         Perl_re_printf( aTHX_  "\n" );
2342     }
2343     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2344                                 depth);
2345     for (word=1; word <= trie->wordcount; word++) {
2346         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2347             (int)word, (int)(trie->wordinfo[word].prev),
2348             (int)(trie->wordinfo[word].len));
2349     }
2350     Perl_re_printf( aTHX_  "\n" );
2351 }
2352 /*
2353   Dumps a fully constructed but uncompressed trie in list form.
2354   List tries normally only are used for construction when the number of
2355   possible chars (trie->uniquecharcount) is very high.
2356   Used for debugging make_trie().
2357 */
2358 STATIC void
2359 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2360                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2361                          U32 depth)
2362 {
2363     U32 state;
2364     SV *sv=sv_newmortal();
2365     int colwidth= widecharmap ? 6 : 4;
2366     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2367
2368     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2369
2370     /* print out the table precompression.  */
2371     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2372             depth+1 );
2373     Perl_re_indentf( aTHX_  "%s",
2374             depth+1, "------:-----+-----------------\n" );
2375
2376     for( state=1 ; state < next_alloc ; state ++ ) {
2377         U16 charid;
2378
2379         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2380             depth+1, (UV)state  );
2381         if ( ! trie->states[ state ].wordnum ) {
2382             Perl_re_printf( aTHX_  "%5s| ","");
2383         } else {
2384             Perl_re_printf( aTHX_  "W%4x| ",
2385                 trie->states[ state ].wordnum
2386             );
2387         }
2388         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2389             SV ** const tmp = av_fetch( revcharmap,
2390                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2391             if ( tmp ) {
2392                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2393                     colwidth,
2394                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2395                               colwidth,
2396                               PL_colors[0], PL_colors[1],
2397                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2398                               | PERL_PV_ESCAPE_FIRSTCHAR
2399                     ) ,
2400                     TRIE_LIST_ITEM(state, charid).forid,
2401                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2402                 );
2403                 if (!(charid % 10))
2404                     Perl_re_printf( aTHX_  "\n%*s| ",
2405                         (int)((depth * 2) + 14), "");
2406             }
2407         }
2408         Perl_re_printf( aTHX_  "\n");
2409     }
2410 }
2411
2412 /*
2413   Dumps a fully constructed but uncompressed trie in table form.
2414   This is the normal DFA style state transition table, with a few
2415   twists to facilitate compression later.
2416   Used for debugging make_trie().
2417 */
2418 STATIC void
2419 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2420                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2421                           U32 depth)
2422 {
2423     U32 state;
2424     U16 charid;
2425     SV *sv=sv_newmortal();
2426     int colwidth= widecharmap ? 6 : 4;
2427     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2428
2429     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2430
2431     /*
2432        print out the table precompression so that we can do a visual check
2433        that they are identical.
2434      */
2435
2436     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2437
2438     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2439         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2440         if ( tmp ) {
2441             Perl_re_printf( aTHX_  "%*s",
2442                 colwidth,
2443                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2444                             PL_colors[0], PL_colors[1],
2445                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2446                             PERL_PV_ESCAPE_FIRSTCHAR
2447                 )
2448             );
2449         }
2450     }
2451
2452     Perl_re_printf( aTHX_ "\n");
2453     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2454
2455     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2456         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2457     }
2458
2459     Perl_re_printf( aTHX_  "\n" );
2460
2461     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2462
2463         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2464             depth+1,
2465             (UV)TRIE_NODENUM( state ) );
2466
2467         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2468             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2469             if (v)
2470                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2471             else
2472                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2473         }
2474         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2475             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2476                                             (UV)trie->trans[ state ].check );
2477         } else {
2478             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2479                                             (UV)trie->trans[ state ].check,
2480             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2481         }
2482     }
2483 }
2484
2485 #endif
2486
2487
2488 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2489   startbranch: the first branch in the whole branch sequence
2490   first      : start branch of sequence of branch-exact nodes.
2491                May be the same as startbranch
2492   last       : Thing following the last branch.
2493                May be the same as tail.
2494   tail       : item following the branch sequence
2495   count      : words in the sequence
2496   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2497   depth      : indent depth
2498
2499 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2500
2501 A trie is an N'ary tree where the branches are determined by digital
2502 decomposition of the key. IE, at the root node you look up the 1st character and
2503 follow that branch repeat until you find the end of the branches. Nodes can be
2504 marked as "accepting" meaning they represent a complete word. Eg:
2505
2506   /he|she|his|hers/
2507
2508 would convert into the following structure. Numbers represent states, letters
2509 following numbers represent valid transitions on the letter from that state, if
2510 the number is in square brackets it represents an accepting state, otherwise it
2511 will be in parenthesis.
2512
2513       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2514       |    |
2515       |   (2)
2516       |    |
2517      (1)   +-i->(6)-+-s->[7]
2518       |
2519       +-s->(3)-+-h->(4)-+-e->[5]
2520
2521       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2522
2523 This shows that when matching against the string 'hers' we will begin at state 1
2524 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2525 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2526 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2527 single traverse. We store a mapping from accepting to state to which word was
2528 matched, and then when we have multiple possibilities we try to complete the
2529 rest of the regex in the order in which they occurred in the alternation.
2530
2531 The only prior NFA like behaviour that would be changed by the TRIE support is
2532 the silent ignoring of duplicate alternations which are of the form:
2533
2534  / (DUPE|DUPE) X? (?{ ... }) Y /x
2535
2536 Thus EVAL blocks following a trie may be called a different number of times with
2537 and without the optimisation. With the optimisations dupes will be silently
2538 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2539 the following demonstrates:
2540
2541  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2542
2543 which prints out 'word' three times, but
2544
2545  'words'=~/(word|word|word)(?{ print $1 })S/
2546
2547 which doesnt print it out at all. This is due to other optimisations kicking in.
2548
2549 Example of what happens on a structural level:
2550
2551 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2552
2553    1: CURLYM[1] {1,32767}(18)
2554    5:   BRANCH(8)
2555    6:     EXACT <ac>(16)
2556    8:   BRANCH(11)
2557    9:     EXACT <ad>(16)
2558   11:   BRANCH(14)
2559   12:     EXACT <ab>(16)
2560   16:   SUCCEED(0)
2561   17:   NOTHING(18)
2562   18: END(0)
2563
2564 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2565 and should turn into:
2566
2567    1: CURLYM[1] {1,32767}(18)
2568    5:   TRIE(16)
2569         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2570           <ac>
2571           <ad>
2572           <ab>
2573   16:   SUCCEED(0)
2574   17:   NOTHING(18)
2575   18: END(0)
2576
2577 Cases where tail != last would be like /(?foo|bar)baz/:
2578
2579    1: BRANCH(4)
2580    2:   EXACT <foo>(8)
2581    4: BRANCH(7)
2582    5:   EXACT <bar>(8)
2583    7: TAIL(8)
2584    8: EXACT <baz>(10)
2585   10: END(0)
2586
2587 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2588 and would end up looking like:
2589
2590     1: TRIE(8)
2591       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2592         <foo>
2593         <bar>
2594    7: TAIL(8)
2595    8: EXACT <baz>(10)
2596   10: END(0)
2597
2598     d = uvchr_to_utf8_flags(d, uv, 0);
2599
2600 is the recommended Unicode-aware way of saying
2601
2602     *(d++) = uv;
2603 */
2604
2605 #define TRIE_STORE_REVCHAR(val)                                            \
2606     STMT_START {                                                           \
2607         if (UTF) {                                                         \
2608             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2609             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2610             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2611             *kapow = '\0';                                                 \
2612             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2613             SvPOK_on(zlopp);                                               \
2614             SvUTF8_on(zlopp);                                              \
2615             av_push(revcharmap, zlopp);                                    \
2616         } else {                                                           \
2617             char ooooff = (char)val;                                           \
2618             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2619         }                                                                  \
2620         } STMT_END
2621
2622 /* This gets the next character from the input, folding it if not already
2623  * folded. */
2624 #define TRIE_READ_CHAR STMT_START {                                           \
2625     wordlen++;                                                                \
2626     if ( UTF ) {                                                              \
2627         /* if it is UTF then it is either already folded, or does not need    \
2628          * folding */                                                         \
2629         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2630     }                                                                         \
2631     else if (folder == PL_fold_latin1) {                                      \
2632         /* This folder implies Unicode rules, which in the range expressible  \
2633          *  by not UTF is the lower case, with the two exceptions, one of     \
2634          *  which should have been taken care of before calling this */       \
2635         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2636         uvc = toLOWER_L1(*uc);                                                \
2637         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2638         len = 1;                                                              \
2639     } else {                                                                  \
2640         /* raw data, will be folded later if needed */                        \
2641         uvc = (U32)*uc;                                                       \
2642         len = 1;                                                              \
2643     }                                                                         \
2644 } STMT_END
2645
2646
2647
2648 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2649     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2650         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2651         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2652         TRIE_LIST_LEN( state ) = ging;                          \
2653     }                                                           \
2654     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2655     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2656     TRIE_LIST_CUR( state )++;                                   \
2657 } STMT_END
2658
2659 #define TRIE_LIST_NEW(state) STMT_START {                       \
2660     Newx( trie->states[ state ].trans.list,                     \
2661         4, reg_trie_trans_le );                                 \
2662      TRIE_LIST_CUR( state ) = 1;                                \
2663      TRIE_LIST_LEN( state ) = 4;                                \
2664 } STMT_END
2665
2666 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2667     U16 dupe= trie->states[ state ].wordnum;                    \
2668     regnode * const noper_next = regnext( noper );              \
2669                                                                 \
2670     DEBUG_r({                                                   \
2671         /* store the word for dumping */                        \
2672         SV* tmp;                                                \
2673         if (OP(noper) != NOTHING)                               \
2674             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2675         else                                                    \
2676             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2677         av_push( trie_words, tmp );                             \
2678     });                                                         \
2679                                                                 \
2680     curword++;                                                  \
2681     trie->wordinfo[curword].prev   = 0;                         \
2682     trie->wordinfo[curword].len    = wordlen;                   \
2683     trie->wordinfo[curword].accept = state;                     \
2684                                                                 \
2685     if ( noper_next < tail ) {                                  \
2686         if (!trie->jump)                                        \
2687             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2688                                                  sizeof(U16) ); \
2689         trie->jump[curword] = (U16)(noper_next - convert);      \
2690         if (!jumper)                                            \
2691             jumper = noper_next;                                \
2692         if (!nextbranch)                                        \
2693             nextbranch= regnext(cur);                           \
2694     }                                                           \
2695                                                                 \
2696     if ( dupe ) {                                               \
2697         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2698         /* chain, so that when the bits of chain are later    */\
2699         /* linked together, the dups appear in the chain      */\
2700         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2701         trie->wordinfo[dupe].prev = curword;                    \
2702     } else {                                                    \
2703         /* we haven't inserted this word yet.                */ \
2704         trie->states[ state ].wordnum = curword;                \
2705     }                                                           \
2706 } STMT_END
2707
2708
2709 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2710      ( ( base + charid >=  ucharcount                                   \
2711          && base + charid < ubound                                      \
2712          && state == trie->trans[ base - ucharcount + charid ].check    \
2713          && trie->trans[ base - ucharcount + charid ].next )            \
2714            ? trie->trans[ base - ucharcount + charid ].next             \
2715            : ( state==1 ? special : 0 )                                 \
2716       )
2717
2718 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2719 STMT_START {                                                \
2720     TRIE_BITMAP_SET(trie, uvc);                             \
2721     /* store the folded codepoint */                        \
2722     if ( folder )                                           \
2723         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2724                                                             \
2725     if ( !UTF ) {                                           \
2726         /* store first byte of utf8 representation of */    \
2727         /* variant codepoints */                            \
2728         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2729             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2730         }                                                   \
2731     }                                                       \
2732 } STMT_END
2733 #define MADE_TRIE       1
2734 #define MADE_JUMP_TRIE  2
2735 #define MADE_EXACT_TRIE 4
2736
2737 STATIC I32
2738 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2739                   regnode *first, regnode *last, regnode *tail,
2740                   U32 word_count, U32 flags, U32 depth)
2741 {
2742     /* first pass, loop through and scan words */
2743     reg_trie_data *trie;
2744     HV *widecharmap = NULL;
2745     AV *revcharmap = newAV();
2746     regnode *cur;
2747     STRLEN len = 0;
2748     UV uvc = 0;
2749     U16 curword = 0;
2750     U32 next_alloc = 0;
2751     regnode *jumper = NULL;
2752     regnode *nextbranch = NULL;
2753     regnode *convert = NULL;
2754     U32 *prev_states; /* temp array mapping each state to previous one */
2755     /* we just use folder as a flag in utf8 */
2756     const U8 * folder = NULL;
2757
2758     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2759      * which stands for one trie structure, one hash, optionally followed
2760      * by two arrays */
2761 #ifdef DEBUGGING
2762     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2763     AV *trie_words = NULL;
2764     /* along with revcharmap, this only used during construction but both are
2765      * useful during debugging so we store them in the struct when debugging.
2766      */
2767 #else
2768     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2769     STRLEN trie_charcount=0;
2770 #endif
2771     SV *re_trie_maxbuff;
2772     DECLARE_AND_GET_RE_DEBUG_FLAGS;
2773
2774     PERL_ARGS_ASSERT_MAKE_TRIE;
2775 #ifndef DEBUGGING
2776     PERL_UNUSED_ARG(depth);
2777 #endif
2778
2779     switch (flags) {
2780         case EXACT: case EXACT_REQ8: case EXACTL: break;
2781         case EXACTFAA:
2782         case EXACTFUP:
2783         case EXACTFU:
2784         case EXACTFLU8: folder = PL_fold_latin1; break;
2785         case EXACTF:  folder = PL_fold; break;
2786         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2787     }
2788
2789     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2790     trie->refcount = 1;
2791     trie->startstate = 1;
2792     trie->wordcount = word_count;
2793     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2794     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2795     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2796         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2797     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2798                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2799
2800     DEBUG_r({
2801         trie_words = newAV();
2802     });
2803
2804     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2805     assert(re_trie_maxbuff);
2806     if (!SvIOK(re_trie_maxbuff)) {
2807         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2808     }
2809     DEBUG_TRIE_COMPILE_r({
2810         Perl_re_indentf( aTHX_
2811           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2812           depth+1,
2813           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2814           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2815     });
2816
2817    /* Find the node we are going to overwrite */
2818     if ( first == startbranch && OP( last ) != BRANCH ) {
2819         /* whole branch chain */
2820         convert = first;
2821     } else {
2822         /* branch sub-chain */
2823         convert = NEXTOPER( first );
2824     }
2825
2826     /*  -- First loop and Setup --
2827
2828        We first traverse the branches and scan each word to determine if it
2829        contains widechars, and how many unique chars there are, this is
2830        important as we have to build a table with at least as many columns as we
2831        have unique chars.
2832
2833        We use an array of integers to represent the character codes 0..255
2834        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2835        the native representation of the character value as the key and IV's for
2836        the coded index.
2837
2838        *TODO* If we keep track of how many times each character is used we can
2839        remap the columns so that the table compression later on is more
2840        efficient in terms of memory by ensuring the most common value is in the
2841        middle and the least common are on the outside.  IMO this would be better
2842        than a most to least common mapping as theres a decent chance the most
2843        common letter will share a node with the least common, meaning the node
2844        will not be compressible. With a middle is most common approach the worst
2845        case is when we have the least common nodes twice.
2846
2847      */
2848
2849     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2850         regnode *noper = NEXTOPER( cur );
2851         const U8 *uc;
2852         const U8 *e;
2853         int foldlen = 0;
2854         U32 wordlen      = 0;         /* required init */
2855         STRLEN minchars = 0;
2856         STRLEN maxchars = 0;
2857         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2858                                                bitmap?*/
2859
2860         if (OP(noper) == NOTHING) {
2861             /* skip past a NOTHING at the start of an alternation
2862              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2863              *
2864              * If the next node is not something we are supposed to process
2865              * we will just ignore it due to the condition guarding the
2866              * next block.
2867              */
2868
2869             regnode *noper_next= regnext(noper);
2870             if (noper_next < tail)
2871                 noper= noper_next;
2872         }
2873
2874         if (    noper < tail
2875             && (    OP(noper) == flags
2876                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2877                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2878                                          || OP(noper) == EXACTFUP))))
2879         {
2880             uc= (U8*)STRING(noper);
2881             e= uc + STR_LEN(noper);
2882         } else {
2883             trie->minlen= 0;
2884             continue;
2885         }
2886
2887
2888         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2889             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2890                                           regardless of encoding */
2891             if (OP( noper ) == EXACTFUP) {
2892                 /* false positives are ok, so just set this */
2893                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2894             }
2895         }
2896
2897         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2898                                            branch */
2899             TRIE_CHARCOUNT(trie)++;
2900             TRIE_READ_CHAR;
2901
2902             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2903              * is in effect.  Under /i, this character can match itself, or
2904              * anything that folds to it.  If not under /i, it can match just
2905              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2906              * all fold to k, and all are single characters.   But some folds
2907              * expand to more than one character, so for example LATIN SMALL
2908              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2909              * the string beginning at 'uc' is 'ffi', it could be matched by
2910              * three characters, or just by the one ligature character. (It
2911              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2912              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2913              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2914              * match.)  The trie needs to know the minimum and maximum number
2915              * of characters that could match so that it can use size alone to
2916              * quickly reject many match attempts.  The max is simple: it is
2917              * the number of folded characters in this branch (since a fold is
2918              * never shorter than what folds to it. */
2919
2920             maxchars++;
2921
2922             /* And the min is equal to the max if not under /i (indicated by
2923              * 'folder' being NULL), or there are no multi-character folds.  If
2924              * there is a multi-character fold, the min is incremented just
2925              * once, for the character that folds to the sequence.  Each
2926              * character in the sequence needs to be added to the list below of
2927              * characters in the trie, but we count only the first towards the
2928              * min number of characters needed.  This is done through the
2929              * variable 'foldlen', which is returned by the macros that look
2930              * for these sequences as the number of bytes the sequence
2931              * occupies.  Each time through the loop, we decrement 'foldlen' by
2932              * how many bytes the current char occupies.  Only when it reaches
2933              * 0 do we increment 'minchars' or look for another multi-character
2934              * sequence. */
2935             if (folder == NULL) {
2936                 minchars++;
2937             }
2938             else if (foldlen > 0) {
2939                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2940             }
2941             else {
2942                 minchars++;
2943
2944                 /* See if *uc is the beginning of a multi-character fold.  If
2945                  * so, we decrement the length remaining to look at, to account
2946                  * for the current character this iteration.  (We can use 'uc'
2947                  * instead of the fold returned by TRIE_READ_CHAR because the
2948                  * macro is smart enough to account for any unfolded
2949                  * characters. */
2950                 if (UTF) {
2951                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2952                         foldlen -= UTF8SKIP(uc);
2953                     }
2954                 }
2955                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2956                     foldlen--;
2957                 }
2958             }
2959
2960             /* The current character (and any potential folds) should be added
2961              * to the possible matching characters for this position in this
2962              * branch */
2963             if ( uvc < 256 ) {
2964                 if ( folder ) {
2965                     U8 folded= folder[ (U8) uvc ];
2966                     if ( !trie->charmap[ folded ] ) {
2967                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2968                         TRIE_STORE_REVCHAR( folded );
2969                     }
2970                 }
2971                 if ( !trie->charmap[ uvc ] ) {
2972                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2973                     TRIE_STORE_REVCHAR( uvc );
2974                 }
2975                 if ( set_bit ) {
2976                     /* store the codepoint in the bitmap, and its folded
2977                      * equivalent. */
2978                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2979                     set_bit = 0; /* We've done our bit :-) */
2980                 }
2981             } else {
2982
2983                 /* XXX We could come up with the list of code points that fold
2984                  * to this using PL_utf8_foldclosures, except not for
2985                  * multi-char folds, as there may be multiple combinations
2986                  * there that could work, which needs to wait until runtime to
2987                  * resolve (The comment about LIGATURE FFI above is such an
2988                  * example */
2989
2990                 SV** svpp;
2991                 if ( !widecharmap )
2992                     widecharmap = newHV();
2993
2994                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2995
2996                 if ( !svpp )
2997                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2998
2999                 if ( !SvTRUE( *svpp ) ) {
3000                     sv_setiv( *svpp, ++trie->uniquecharcount );
3001                     TRIE_STORE_REVCHAR(uvc);
3002                 }
3003             }
3004         } /* end loop through characters in this branch of the trie */
3005
3006         /* We take the min and max for this branch and combine to find the min
3007          * and max for all branches processed so far */
3008         if( cur == first ) {
3009             trie->minlen = minchars;
3010             trie->maxlen = maxchars;
3011         } else if (minchars < trie->minlen) {
3012             trie->minlen = minchars;
3013         } else if (maxchars > trie->maxlen) {
3014             trie->maxlen = maxchars;
3015         }
3016     } /* end first pass */
3017     DEBUG_TRIE_COMPILE_r(
3018         Perl_re_indentf( aTHX_
3019                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3020                 depth+1,
3021                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3022                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3023                 (int)trie->minlen, (int)trie->maxlen )
3024     );
3025
3026     /*
3027         We now know what we are dealing with in terms of unique chars and
3028         string sizes so we can calculate how much memory a naive
3029         representation using a flat table  will take. If it's over a reasonable
3030         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3031         conservative but potentially much slower representation using an array
3032         of lists.
3033
3034         At the end we convert both representations into the same compressed
3035         form that will be used in regexec.c for matching with. The latter
3036         is a form that cannot be used to construct with but has memory
3037         properties similar to the list form and access properties similar
3038         to the table form making it both suitable for fast searches and
3039         small enough that its feasable to store for the duration of a program.
3040
3041         See the comment in the code where the compressed table is produced
3042         inplace from the flat tabe representation for an explanation of how
3043         the compression works.
3044
3045     */
3046
3047
3048     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3049     prev_states[1] = 0;
3050
3051     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3052                                                     > SvIV(re_trie_maxbuff) )
3053     {
3054         /*
3055             Second Pass -- Array Of Lists Representation
3056
3057             Each state will be represented by a list of charid:state records
3058             (reg_trie_trans_le) the first such element holds the CUR and LEN
3059             points of the allocated array. (See defines above).
3060
3061             We build the initial structure using the lists, and then convert
3062             it into the compressed table form which allows faster lookups
3063             (but cant be modified once converted).
3064         */
3065
3066         STRLEN transcount = 1;
3067
3068         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3069             depth+1));
3070
3071         trie->states = (reg_trie_state *)
3072             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3073                                   sizeof(reg_trie_state) );
3074         TRIE_LIST_NEW(1);
3075         next_alloc = 2;
3076
3077         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3078
3079             regnode *noper   = NEXTOPER( cur );
3080             U32 state        = 1;         /* required init */
3081             U16 charid       = 0;         /* sanity init */
3082             U32 wordlen      = 0;         /* required init */
3083
3084             if (OP(noper) == NOTHING) {
3085                 regnode *noper_next= regnext(noper);
3086                 if (noper_next < tail)
3087                     noper= noper_next;
3088                 /* we will undo this assignment if noper does not
3089                  * point at a trieable type in the else clause of
3090                  * the following statement. */
3091             }
3092
3093             if (    noper < tail
3094                 && (    OP(noper) == flags
3095                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3096                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3097                                              || OP(noper) == EXACTFUP))))
3098             {
3099                 const U8 *uc= (U8*)STRING(noper);
3100                 const U8 *e= uc + STR_LEN(noper);
3101
3102                 for ( ; uc < e ; uc += len ) {
3103
3104                     TRIE_READ_CHAR;
3105
3106                     if ( uvc < 256 ) {
3107                         charid = trie->charmap[ uvc ];
3108                     } else {
3109                         SV** const svpp = hv_fetch( widecharmap,
3110                                                     (char*)&uvc,
3111                                                     sizeof( UV ),
3112                                                     0);
3113                         if ( !svpp ) {
3114                             charid = 0;
3115                         } else {
3116                             charid=(U16)SvIV( *svpp );
3117                         }
3118                     }
3119                     /* charid is now 0 if we dont know the char read, or
3120                      * nonzero if we do */
3121                     if ( charid ) {
3122
3123                         U16 check;
3124                         U32 newstate = 0;
3125
3126                         charid--;
3127                         if ( !trie->states[ state ].trans.list ) {
3128                             TRIE_LIST_NEW( state );
3129                         }
3130                         for ( check = 1;
3131                               check <= TRIE_LIST_USED( state );
3132                               check++ )
3133                         {
3134                             if ( TRIE_LIST_ITEM( state, check ).forid
3135                                                                     == charid )
3136                             {
3137                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3138                                 break;
3139                             }
3140                         }
3141                         if ( ! newstate ) {
3142                             newstate = next_alloc++;
3143                             prev_states[newstate] = state;
3144                             TRIE_LIST_PUSH( state, charid, newstate );
3145                             transcount++;
3146                         }
3147                         state = newstate;
3148                     } else {
3149                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3150                     }
3151                 }
3152             } else {
3153                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3154                  * on a trieable type. So we need to reset noper back to point at the first regop
3155                  * in the branch before we call TRIE_HANDLE_WORD()
3156                 */
3157                 noper= NEXTOPER(cur);
3158             }
3159             TRIE_HANDLE_WORD(state);
3160
3161         } /* end second pass */
3162
3163         /* next alloc is the NEXT state to be allocated */
3164         trie->statecount = next_alloc;
3165         trie->states = (reg_trie_state *)
3166             PerlMemShared_realloc( trie->states,
3167                                    next_alloc
3168                                    * sizeof(reg_trie_state) );
3169
3170         /* and now dump it out before we compress it */
3171         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3172                                                          revcharmap, next_alloc,
3173                                                          depth+1)
3174         );
3175
3176         trie->trans = (reg_trie_trans *)
3177             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3178         {
3179             U32 state;
3180             U32 tp = 0;
3181             U32 zp = 0;
3182
3183
3184             for( state=1 ; state < next_alloc ; state ++ ) {
3185                 U32 base=0;
3186
3187                 /*
3188                 DEBUG_TRIE_COMPILE_MORE_r(
3189                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3190                 );
3191                 */
3192
3193                 if (trie->states[state].trans.list) {
3194                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3195                     U16 maxid=minid;
3196                     U16 idx;
3197
3198                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3199                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3200                         if ( forid < minid ) {
3201                             minid=forid;
3202                         } else if ( forid > maxid ) {
3203                             maxid=forid;
3204                         }
3205                     }
3206                     if ( transcount < tp + maxid - minid + 1) {
3207                         transcount *= 2;
3208                         trie->trans = (reg_trie_trans *)
3209                             PerlMemShared_realloc( trie->trans,
3210                                                      transcount
3211                                                      * sizeof(reg_trie_trans) );
3212                         Zero( trie->trans + (transcount / 2),
3213                               transcount / 2,
3214                               reg_trie_trans );
3215                     }
3216                     base = trie->uniquecharcount + tp - minid;
3217                     if ( maxid == minid ) {
3218                         U32 set = 0;
3219                         for ( ; zp < tp ; zp++ ) {
3220                             if ( ! trie->trans[ zp ].next ) {
3221                                 base = trie->uniquecharcount + zp - minid;
3222                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3223                                                                    1).newstate;
3224                                 trie->trans[ zp ].check = state;
3225                                 set = 1;
3226                                 break;
3227                             }
3228                         }
3229                         if ( !set ) {
3230                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3231                                                                    1).newstate;
3232                             trie->trans[ tp ].check = state;
3233                             tp++;
3234                             zp = tp;
3235                         }
3236                     } else {
3237                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3238                             const U32 tid = base
3239                                            - trie->uniquecharcount
3240                                            + TRIE_LIST_ITEM( state, idx ).forid;
3241                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3242                                                                 idx ).newstate;
3243                             trie->trans[ tid ].check = state;
3244                         }
3245                         tp += ( maxid - minid + 1 );
3246                     }
3247                     Safefree(trie->states[ state ].trans.list);
3248                 }
3249                 /*
3250                 DEBUG_TRIE_COMPILE_MORE_r(
3251                     Perl_re_printf( aTHX_  " base: %d\n",base);
3252                 );
3253                 */
3254                 trie->states[ state ].trans.base=base;
3255             }
3256             trie->lasttrans = tp + 1;
3257         }
3258     } else {
3259         /*
3260            Second Pass -- Flat Table Representation.
3261
3262            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3263            each.  We know that we will need Charcount+1 trans at most to store
3264            the data (one row per char at worst case) So we preallocate both
3265            structures assuming worst case.
3266
3267            We then construct the trie using only the .next slots of the entry
3268            structs.
3269
3270            We use the .check field of the first entry of the node temporarily
3271            to make compression both faster and easier by keeping track of how
3272            many non zero fields are in the node.
3273
3274            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3275            transition.
3276
3277            There are two terms at use here: state as a TRIE_NODEIDX() which is
3278            a number representing the first entry of the node, and state as a
3279            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3280            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3281            if there are 2 entrys per node. eg:
3282
3283              A B       A B
3284           1. 2 4    1. 3 7
3285           2. 0 3    3. 0 5
3286           3. 0 0    5. 0 0
3287           4. 0 0    7. 0 0
3288
3289            The table is internally in the right hand, idx form. However as we
3290            also have to deal with the states array which is indexed by nodenum
3291            we have to use TRIE_NODENUM() to convert.
3292
3293         */
3294         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3295             depth+1));
3296
3297         trie->trans = (reg_trie_trans *)
3298             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3299                                   * trie->uniquecharcount + 1,
3300                                   sizeof(reg_trie_trans) );
3301         trie->states = (reg_trie_state *)
3302             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3303                                   sizeof(reg_trie_state) );
3304         next_alloc = trie->uniquecharcount + 1;
3305
3306
3307         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3308
3309             regnode *noper   = NEXTOPER( cur );
3310
3311             U32 state        = 1;         /* required init */
3312
3313             U16 charid       = 0;         /* sanity init */
3314             U32 accept_state = 0;         /* sanity init */
3315
3316             U32 wordlen      = 0;         /* required init */
3317
3318             if (OP(noper) == NOTHING) {
3319                 regnode *noper_next= regnext(noper);
3320                 if (noper_next < tail)
3321                     noper= noper_next;
3322                 /* we will undo this assignment if noper does not
3323                  * point at a trieable type in the else clause of
3324                  * the following statement. */
3325             }
3326
3327             if (    noper < tail
3328                 && (    OP(noper) == flags
3329                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3330                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3331                                              || OP(noper) == EXACTFUP))))
3332             {
3333                 const U8 *uc= (U8*)STRING(noper);
3334                 const U8 *e= uc + STR_LEN(noper);
3335
3336                 for ( ; uc < e ; uc += len ) {
3337
3338                     TRIE_READ_CHAR;
3339
3340                     if ( uvc < 256 ) {
3341                         charid = trie->charmap[ uvc ];
3342                     } else {
3343                         SV* const * const svpp = hv_fetch( widecharmap,
3344                                                            (char*)&uvc,
3345                                                            sizeof( UV ),
3346                                                            0);
3347                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3348                     }
3349                     if ( charid ) {
3350                         charid--;
3351                         if ( !trie->trans[ state + charid ].next ) {
3352                             trie->trans[ state + charid ].next = next_alloc;
3353                             trie->trans[ state ].check++;
3354                             prev_states[TRIE_NODENUM(next_alloc)]
3355                                     = TRIE_NODENUM(state);
3356                             next_alloc += trie->uniquecharcount;
3357                         }
3358                         state = trie->trans[ state + charid ].next;
3359                     } else {
3360                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3361                     }
3362                     /* charid is now 0 if we dont know the char read, or
3363                      * nonzero if we do */
3364                 }
3365             } else {
3366                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3367                  * on a trieable type. So we need to reset noper back to point at the first regop
3368                  * in the branch before we call TRIE_HANDLE_WORD().
3369                 */
3370                 noper= NEXTOPER(cur);
3371             }
3372             accept_state = TRIE_NODENUM( state );
3373             TRIE_HANDLE_WORD(accept_state);
3374
3375         } /* end second pass */
3376
3377         /* and now dump it out before we compress it */
3378         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3379                                                           revcharmap,
3380                                                           next_alloc, depth+1));
3381
3382         {
3383         /*
3384            * Inplace compress the table.*
3385
3386            For sparse data sets the table constructed by the trie algorithm will
3387            be mostly 0/FAIL transitions or to put it another way mostly empty.
3388            (Note that leaf nodes will not contain any transitions.)
3389
3390            This algorithm compresses the tables by eliminating most such
3391            transitions, at the cost of a modest bit of extra work during lookup:
3392
3393            - Each states[] entry contains a .base field which indicates the
3394            index in the state[] array wheres its transition data is stored.
3395
3396            - If .base is 0 there are no valid transitions from that node.
3397
3398            - If .base is nonzero then charid is added to it to find an entry in
3399            the trans array.
3400
3401            -If trans[states[state].base+charid].check!=state then the
3402            transition is taken to be a 0/Fail transition. Thus if there are fail
3403            transitions at the front of the node then the .base offset will point
3404            somewhere inside the previous nodes data (or maybe even into a node
3405            even earlier), but the .check field determines if the transition is
3406            valid.
3407
3408            XXX - wrong maybe?
3409            The following process inplace converts the table to the compressed
3410            table: We first do not compress the root node 1,and mark all its
3411            .check pointers as 1 and set its .base pointer as 1 as well. This
3412            allows us to do a DFA construction from the compressed table later,
3413            and ensures that any .base pointers we calculate later are greater
3414            than 0.
3415
3416            - We set 'pos' to indicate the first entry of the second node.
3417
3418            - We then iterate over the columns of the node, finding the first and
3419            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3420            and set the .check pointers accordingly, and advance pos
3421            appropriately and repreat for the next node. Note that when we copy
3422            the next pointers we have to convert them from the original
3423            NODEIDX form to NODENUM form as the former is not valid post
3424            compression.
3425
3426            - If a node has no transitions used we mark its base as 0 and do not
3427            advance the pos pointer.
3428
3429            - If a node only has one transition we use a second pointer into the
3430            structure to fill in allocated fail transitions from other states.
3431            This pointer is independent of the main pointer and scans forward
3432            looking for null transitions that are allocated to a state. When it
3433            finds one it writes the single transition into the "hole".  If the
3434            pointer doesnt find one the single transition is appended as normal.
3435
3436            - Once compressed we can Renew/realloc the structures to release the
3437            excess space.
3438
3439            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3440            specifically Fig 3.47 and the associated pseudocode.
3441
3442            demq
3443         */
3444         const U32 laststate = TRIE_NODENUM( next_alloc );
3445         U32 state, charid;
3446         U32 pos = 0, zp=0;
3447         trie->statecount = laststate;
3448
3449         for ( state = 1 ; state < laststate ; state++ ) {
3450             U8 flag = 0;
3451             const U32 stateidx = TRIE_NODEIDX( state );
3452             const U32 o_used = trie->trans[ stateidx ].check;
3453             U32 used = trie->trans[ stateidx ].check;
3454             trie->trans[ stateidx ].check = 0;
3455
3456             for ( charid = 0;
3457                   used && charid < trie->uniquecharcount;
3458                   charid++ )
3459             {
3460                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3461                     if ( trie->trans[ stateidx + charid ].next ) {
3462                         if (o_used == 1) {
3463                             for ( ; zp < pos ; zp++ ) {
3464                                 if ( ! trie->trans[ zp ].next ) {
3465                                     break;
3466                                 }
3467                             }
3468                             trie->states[ state ].trans.base
3469                                                     = zp
3470                                                       + trie->uniquecharcount
3471                                                       - charid ;
3472                             trie->trans[ zp ].next
3473                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3474                                                              + charid ].next );
3475                             trie->trans[ zp ].check = state;
3476                             if ( ++zp > pos ) pos = zp;
3477                             break;
3478                         }
3479                         used--;
3480                     }
3481                     if ( !flag ) {
3482                         flag = 1;
3483                         trie->states[ state ].trans.base
3484                                        = pos + trie->uniquecharcount - charid ;
3485                     }
3486                     trie->trans[ pos ].next
3487                         = SAFE_TRIE_NODENUM(
3488                                        trie->trans[ stateidx + charid ].next );
3489                     trie->trans[ pos ].check = state;
3490                     pos++;
3491                 }
3492             }
3493         }
3494         trie->lasttrans = pos + 1;
3495         trie->states = (reg_trie_state *)
3496             PerlMemShared_realloc( trie->states, laststate
3497                                    * sizeof(reg_trie_state) );
3498         DEBUG_TRIE_COMPILE_MORE_r(
3499             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3500                 depth+1,
3501                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3502                        + 1 ),
3503                 (IV)next_alloc,
3504                 (IV)pos,
3505                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3506             );
3507
3508         } /* end table compress */
3509     }
3510     DEBUG_TRIE_COMPILE_MORE_r(
3511             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3512                 depth+1,
3513                 (UV)trie->statecount,
3514                 (UV)trie->lasttrans)
3515     );
3516     /* resize the trans array to remove unused space */
3517     trie->trans = (reg_trie_trans *)
3518         PerlMemShared_realloc( trie->trans, trie->lasttrans
3519                                * sizeof(reg_trie_trans) );
3520
3521     {   /* Modify the program and insert the new TRIE node */
3522         U8 nodetype =(U8)(flags & 0xFF);
3523         char *str=NULL;
3524
3525 #ifdef DEBUGGING
3526         regnode *optimize = NULL;
3527 #ifdef RE_TRACK_PATTERN_OFFSETS
3528
3529         U32 mjd_offset = 0;
3530         U32 mjd_nodelen = 0;
3531 #endif /* RE_TRACK_PATTERN_OFFSETS */
3532 #endif /* DEBUGGING */
3533         /*
3534            This means we convert either the first branch or the first Exact,
3535            depending on whether the thing following (in 'last') is a branch
3536            or not and whther first is the startbranch (ie is it a sub part of
3537            the alternation or is it the whole thing.)
3538            Assuming its a sub part we convert the EXACT otherwise we convert
3539            the whole branch sequence, including the first.
3540          */
3541         /* Find the node we are going to overwrite */
3542         if ( first != startbranch || OP( last ) == BRANCH ) {
3543             /* branch sub-chain */
3544             NEXT_OFF( first ) = (U16)(last - first);
3545 #ifdef RE_TRACK_PATTERN_OFFSETS
3546             DEBUG_r({
3547                 mjd_offset= Node_Offset((convert));
3548                 mjd_nodelen= Node_Length((convert));
3549             });
3550 #endif
3551             /* whole branch chain */
3552         }
3553 #ifdef RE_TRACK_PATTERN_OFFSETS
3554         else {
3555             DEBUG_r({
3556                 const  regnode *nop = NEXTOPER( convert );
3557                 mjd_offset= Node_Offset((nop));
3558                 mjd_nodelen= Node_Length((nop));
3559             });
3560         }
3561         DEBUG_OPTIMISE_r(
3562             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3563                 depth+1,
3564                 (UV)mjd_offset, (UV)mjd_nodelen)
3565         );
3566 #endif
3567         /* But first we check to see if there is a common prefix we can
3568            split out as an EXACT and put in front of the TRIE node.  */
3569         trie->startstate= 1;
3570         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3571             /* we want to find the first state that has more than
3572              * one transition, if that state is not the first state
3573              * then we have a common prefix which we can remove.
3574              */
3575             U32 state;
3576             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3577                 U32 ofs = 0;
3578                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3579                                        transition, -1 means none */
3580                 U32 count = 0;
3581                 const U32 base = trie->states[ state ].trans.base;
3582
3583                 /* does this state terminate an alternation? */
3584                 if ( trie->states[state].wordnum )
3585                         count = 1;
3586
3587                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3588                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3589                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3590                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3591                     {
3592                         if ( ++count > 1 ) {
3593                             /* we have more than one transition */
3594                             SV **tmp;
3595                             U8 *ch;
3596                             /* if this is the first state there is no common prefix
3597                              * to extract, so we can exit */
3598                             if ( state == 1 ) break;
3599                             tmp = av_fetch( revcharmap, ofs, 0);
3600                             ch = (U8*)SvPV_nolen_const( *tmp );
3601
3602                             /* if we are on count 2 then we need to initialize the
3603                              * bitmap, and store the previous char if there was one
3604                              * in it*/
3605                             if ( count == 2 ) {
3606                                 /* clear the bitmap */
3607                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3608                                 DEBUG_OPTIMISE_r(
3609                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3610                                         depth+1,
3611                                         (UV)state));
3612                                 if (first_ofs >= 0) {
3613                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3614                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3615
3616                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3617                                     DEBUG_OPTIMISE_r(
3618                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3619                                     );
3620                                 }
3621                             }
3622                             /* store the current firstchar in the bitmap */
3623                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3624                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3625                         }
3626                         first_ofs = ofs;
3627                     }
3628                 }
3629                 if ( count == 1 ) {
3630                     /* This state has only one transition, its transition is part
3631                      * of a common prefix - we need to concatenate the char it
3632                      * represents to what we have so far. */
3633                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3634                     STRLEN len;
3635                     char *ch = SvPV( *tmp, len );
3636                     DEBUG_OPTIMISE_r({
3637                         SV *sv=sv_newmortal();
3638                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3639                             depth+1,
3640                             (UV)state, (UV)first_ofs,
3641                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3642                                 PL_colors[0], PL_colors[1],
3643                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3644                                 PERL_PV_ESCAPE_FIRSTCHAR
3645                             )
3646                         );
3647                     });
3648                     if ( state==1 ) {
3649                         OP( convert ) = nodetype;
3650                         str=STRING(convert);
3651                         setSTR_LEN(convert, 0);
3652                     }
3653                     assert( ( STR_LEN(convert) + len ) < 256 );
3654                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3655                     while (len--)
3656                         *str++ = *ch++;
3657                 } else {
3658 #ifdef DEBUGGING
3659                     if (state>1)
3660                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3661 #endif
3662                     break;
3663                 }
3664             }
3665             trie->prefixlen = (state-1);
3666             if (str) {
3667                 regnode *n = convert+NODE_SZ_STR(convert);
3668                 assert( NODE_SZ_STR(convert) <= U16_MAX );
3669                 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3670                 trie->startstate = state;
3671                 trie->minlen -= (state - 1);
3672                 trie->maxlen -= (state - 1);
3673 #ifdef DEBUGGING
3674                /* At least the UNICOS C compiler choked on this
3675                 * being argument to DEBUG_r(), so let's just have
3676                 * it right here. */
3677                if (
3678 #ifdef PERL_EXT_RE_BUILD
3679                    1
3680 #else
3681                    DEBUG_r_TEST
3682 #endif
3683                    ) {
3684                    regnode *fix = convert;
3685                    U32 word = trie->wordcount;
3686 #ifdef RE_TRACK_PATTERN_OFFSETS
3687                    mjd_nodelen++;
3688 #endif
3689                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3690                    while( ++fix < n ) {
3691                        Set_Node_Offset_Length(fix, 0, 0);
3692                    }
3693                    while (word--) {
3694                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3695                        if (tmp) {
3696                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3697                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3698                            else
3699                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3700                        }
3701                    }
3702                }
3703 #endif
3704                 if (trie->maxlen) {
3705                     convert = n;
3706                 } else {
3707                     NEXT_OFF(convert) = (U16)(tail - convert);
3708                     DEBUG_r(optimize= n);
3709                 }
3710             }
3711         }
3712         if (!jumper)
3713             jumper = last;
3714         if ( trie->maxlen ) {
3715             NEXT_OFF( convert ) = (U16)(tail - convert);
3716             ARG_SET( convert, data_slot );
3717             /* Store the offset to the first unabsorbed branch in
3718                jump[0], which is otherwise unused by the jump logic.
3719                We use this when dumping a trie and during optimisation. */
3720             if (trie->jump)
3721                 trie->jump[0] = (U16)(nextbranch - convert);
3722
3723             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3724              *   and there is a bitmap
3725              *   and the first "jump target" node we found leaves enough room
3726              * then convert the TRIE node into a TRIEC node, with the bitmap
3727              * embedded inline in the opcode - this is hypothetically faster.
3728              */
3729             if ( !trie->states[trie->startstate].wordnum
3730                  && trie->bitmap
3731                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3732             {
3733                 OP( convert ) = TRIEC;
3734                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3735                 PerlMemShared_free(trie->bitmap);
3736                 trie->bitmap= NULL;
3737             } else
3738                 OP( convert ) = TRIE;
3739
3740             /* store the type in the flags */
3741             convert->flags = nodetype;
3742             DEBUG_r({
3743             optimize = convert
3744                       + NODE_STEP_REGNODE
3745                       + regarglen[ OP( convert ) ];
3746             });
3747             /* XXX We really should free up the resource in trie now,
3748                    as we won't use them - (which resources?) dmq */
3749         }
3750         /* needed for dumping*/
3751         DEBUG_r(if (optimize) {
3752             regnode *opt = convert;
3753
3754             while ( ++opt < optimize) {
3755                 Set_Node_Offset_Length(opt, 0, 0);
3756             }
3757             /*
3758                 Try to clean up some of the debris left after the
3759                 optimisation.
3760              */
3761             while( optimize < jumper ) {
3762                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3763                 OP( optimize ) = OPTIMIZED;
3764                 Set_Node_Offset_Length(optimize, 0, 0);
3765                 optimize++;
3766             }
3767             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3768         });
3769     } /* end node insert */
3770
3771     /*  Finish populating the prev field of the wordinfo array.  Walk back
3772      *  from each accept state until we find another accept state, and if
3773      *  so, point the first word's .prev field at the second word. If the
3774      *  second already has a .prev field set, stop now. This will be the
3775      *  case either if we've already processed that word's accept state,
3776      *  or that state had multiple words, and the overspill words were
3777      *  already linked up earlier.
3778      */
3779     {
3780         U16 word;
3781         U32 state;
3782         U16 prev;
3783
3784         for (word=1; word <= trie->wordcount; word++) {
3785             prev = 0;
3786             if (trie->wordinfo[word].prev)
3787                 continue;
3788             state = trie->wordinfo[word].accept;
3789             while (state) {
3790                 state = prev_states[state];
3791                 if (!state)
3792                     break;
3793                 prev = trie->states[state].wordnum;
3794                 if (prev)
3795                     break;
3796             }
3797             trie->wordinfo[word].prev = prev;
3798         }
3799         Safefree(prev_states);
3800     }
3801
3802
3803     /* and now dump out the compressed format */
3804     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3805
3806     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3807 #ifdef DEBUGGING
3808     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3809     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3810 #else
3811     SvREFCNT_dec_NN(revcharmap);
3812 #endif
3813     return trie->jump
3814            ? MADE_JUMP_TRIE
3815            : trie->startstate>1
3816              ? MADE_EXACT_TRIE
3817              : MADE_TRIE;
3818 }
3819
3820 STATIC regnode *
3821 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3822 {
3823 /* The Trie is constructed and compressed now so we can build a fail array if
3824  * it's needed
3825
3826    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3827    3.32 in the
3828    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3829    Ullman 1985/88
3830    ISBN 0-201-10088-6
3831
3832    We find the fail state for each state in the trie, this state is the longest
3833    proper suffix of the current state's 'word' that is also a proper prefix of
3834    another word in our trie. State 1 represents the word '' and is thus the
3835    default fail state. This allows the DFA not to have to restart after its
3836    tried and failed a word at a given point, it simply continues as though it
3837    had been matching the other word in the first place.
3838    Consider
3839       'abcdgu'=~/abcdefg|cdgu/
3840    When we get to 'd' we are still matching the first word, we would encounter
3841    'g' which would fail, which would bring us to the state representing 'd' in
3842    the second word where we would try 'g' and succeed, proceeding to match
3843    'cdgu'.
3844  */
3845  /* add a fail transition */
3846     const U32 trie_offset = ARG(source);
3847     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3848     U32 *q;
3849     const U32 ucharcount = trie->uniquecharcount;
3850     const U32 numstates = trie->statecount;
3851     const U32 ubound = trie->lasttrans + ucharcount;
3852     U32 q_read = 0;
3853     U32 q_write = 0;
3854     U32 charid;
3855     U32 base = trie->states[ 1 ].trans.base;
3856     U32 *fail;
3857     reg_ac_data *aho;
3858     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3859     regnode *stclass;
3860     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3861
3862     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3863     PERL_UNUSED_CONTEXT;
3864 #ifndef DEBUGGING
3865     PERL_UNUSED_ARG(depth);
3866 #endif
3867
3868     if ( OP(source) == TRIE ) {
3869         struct regnode_1 *op = (struct regnode_1 *)
3870             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3871         StructCopy(source, op, struct regnode_1);
3872         stclass = (regnode *)op;
3873     } else {
3874         struct regnode_charclass *op = (struct regnode_charclass *)
3875             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3876         StructCopy(source, op, struct regnode_charclass);
3877         stclass = (regnode *)op;
3878     }
3879     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3880
3881     ARG_SET( stclass, data_slot );
3882     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3883     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3884     aho->trie=trie_offset;
3885     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3886     Copy( trie->states, aho->states, numstates, reg_trie_state );
3887     Newx( q, numstates, U32);
3888     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3889     aho->refcount = 1;
3890     fail = aho->fail;
3891     /* initialize fail[0..1] to be 1 so that we always have
3892        a valid final fail state */
3893     fail[ 0 ] = fail[ 1 ] = 1;
3894
3895     for ( charid = 0; charid < ucharcount ; charid++ ) {
3896         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3897         if ( newstate ) {
3898             q[ q_write ] = newstate;
3899             /* set to point at the root */
3900             fail[ q[ q_write++ ] ]=1;
3901         }
3902     }
3903     while ( q_read < q_write) {
3904         const U32 cur = q[ q_read++ % numstates ];
3905         base = trie->states[ cur ].trans.base;
3906
3907         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3908             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3909             if (ch_state) {
3910                 U32 fail_state = cur;
3911                 U32 fail_base;
3912                 do {
3913                     fail_state = fail[ fail_state ];
3914                     fail_base = aho->states[ fail_state ].trans.base;
3915                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3916
3917                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3918                 fail[ ch_state ] = fail_state;
3919                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3920                 {
3921                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3922                 }
3923                 q[ q_write++ % numstates] = ch_state;
3924             }
3925         }
3926     }
3927     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3928        when we fail in state 1, this allows us to use the
3929        charclass scan to find a valid start char. This is based on the principle
3930        that theres a good chance the string being searched contains lots of stuff
3931        that cant be a start char.
3932      */
3933     fail[ 0 ] = fail[ 1 ] = 0;
3934     DEBUG_TRIE_COMPILE_r({
3935         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3936                       depth, (UV)numstates
3937         );
3938         for( q_read=1; q_read<numstates; q_read++ ) {
3939             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3940         }
3941         Perl_re_printf( aTHX_  "\n");
3942     });
3943     Safefree(q);
3944     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3945     return stclass;
3946 }
3947
3948
3949 /* The below joins as many adjacent EXACTish nodes as possible into a single
3950  * one.  The regop may be changed if the node(s) contain certain sequences that
3951  * require special handling.  The joining is only done if:
3952  * 1) there is room in the current conglomerated node to entirely contain the
3953  *    next one.
3954  * 2) they are compatible node types
3955  *
3956  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3957  * these get optimized out
3958  *
3959  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3960  * as possible, even if that means splitting an existing node so that its first
3961  * part is moved to the preceeding node.  This would maximise the efficiency of
3962  * memEQ during matching.
3963  *
3964  * If a node is to match under /i (folded), the number of characters it matches
3965  * can be different than its character length if it contains a multi-character
3966  * fold.  *min_subtract is set to the total delta number of characters of the
3967  * input nodes.
3968  *
3969  * And *unfolded_multi_char is set to indicate whether or not the node contains
3970  * an unfolded multi-char fold.  This happens when it won't be known until
3971  * runtime whether the fold is valid or not; namely
3972  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3973  *      target string being matched against turns out to be UTF-8 is that fold
3974  *      valid; or
3975  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3976  *      runtime.
3977  * (Multi-char folds whose components are all above the Latin1 range are not
3978  * run-time locale dependent, and have already been folded by the time this
3979  * function is called.)
3980  *
3981  * This is as good a place as any to discuss the design of handling these
3982  * multi-character fold sequences.  It's been wrong in Perl for a very long
3983  * time.  There are three code points in Unicode whose multi-character folds
3984  * were long ago discovered to mess things up.  The previous designs for
3985  * dealing with these involved assigning a special node for them.  This
3986  * approach doesn't always work, as evidenced by this example:
3987  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3988  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3989  * would match just the \xDF, it won't be able to handle the case where a
3990  * successful match would have to cross the node's boundary.  The new approach
3991  * that hopefully generally solves the problem generates an EXACTFUP node
3992  * that is "sss" in this case.
3993  *
3994  * It turns out that there are problems with all multi-character folds, and not
3995  * just these three.  Now the code is general, for all such cases.  The
3996  * approach taken is:
3997  * 1)   This routine examines each EXACTFish node that could contain multi-
3998  *      character folded sequences.  Since a single character can fold into
3999  *      such a sequence, the minimum match length for this node is less than
4000  *      the number of characters in the node.  This routine returns in
4001  *      *min_subtract how many characters to subtract from the actual
4002  *      length of the string to get a real minimum match length; it is 0 if
4003  *      there are no multi-char foldeds.  This delta is used by the caller to
4004  *      adjust the min length of the match, and the delta between min and max,
4005  *      so that the optimizer doesn't reject these possibilities based on size
4006  *      constraints.
4007  *
4008  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4009  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
4010  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4011  *      EXACTFU nodes.  The node type of such nodes is then changed to
4012  *      EXACTFUP, indicating it is problematic, and needs careful handling.
4013  *      (The procedures in step 1) above are sufficient to handle this case in
4014  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
4015  *      the only case where there is a possible fold length change in non-UTF-8
4016  *      patterns.  By reserving a special node type for problematic cases, the
4017  *      far more common regular EXACTFU nodes can be processed faster.
4018  *      regexec.c takes advantage of this.
4019  *
4020  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4021  *      problematic cases.   These all only occur when the pattern is not
4022  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
4023  *      length change, it handles the situation where the string cannot be
4024  *      entirely folded.  The strings in an EXACTFish node are folded as much
4025  *      as possible during compilation in regcomp.c.  This saves effort in
4026  *      regex matching.  By using an EXACTFUP node when it is not possible to
4027  *      fully fold at compile time, regexec.c can know that everything in an
4028  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
4029  *      case where folding in EXACTFU nodes can't be done at compile time is
4030  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
4031  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
4032  *      handle two very different cases.  Alternatively, there could have been
4033  *      a node type where there are length changes, one for unfolded, and one
4034  *      for both.  If yet another special case needed to be created, the number
4035  *      of required node types would have to go to 7.  khw figures that even
4036  *      though there are plenty of node types to spare, that the maintenance
4037  *      cost wasn't worth the small speedup of doing it that way, especially
4038  *      since he thinks the MICRO SIGN is rarely encountered in practice.
4039  *
4040  *      There are other cases where folding isn't done at compile time, but
4041  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
4042  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
4043  *      changes.  Some folds in EXACTF depend on if the runtime target string
4044  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
4045  *      when no fold in it depends on the UTF-8ness of the target string.)
4046  *
4047  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
4048  *      validity of the fold won't be known until runtime, and so must remain
4049  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
4050  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
4051  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
4052  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4053  *      The reason this is a problem is that the optimizer part of regexec.c
4054  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4055  *      that a character in the pattern corresponds to at most a single
4056  *      character in the target string.  (And I do mean character, and not byte
4057  *      here, unlike other parts of the documentation that have never been
4058  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
4059  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4060  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
4061  *      EXACTFL nodes, violate the assumption, and they are the only instances
4062  *      where it is violated.  I'm reluctant to try to change the assumption,
4063  *      as the code involved is impenetrable to me (khw), so instead the code
4064  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
4065  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4066  *      boolean indicating whether or not the node contains such a fold.  When
4067  *      it is true, the caller sets a flag that later causes the optimizer in
4068  *      this file to not set values for the floating and fixed string lengths,
4069  *      and thus avoids the optimizer code in regexec.c that makes the invalid
4070  *      assumption.  Thus, there is no optimization based on string lengths for
4071  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4072  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
4073  *      assumption is wrong only in these cases is that all other non-UTF-8
4074  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4075  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
4076  *      EXACTF nodes because we don't know at compile time if it actually
4077  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
4078  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
4079  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
4080  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4081  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4082  *      string would require the pattern to be forced into UTF-8, the overhead
4083  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
4084  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4085  *      locale.)
4086  *
4087  *      Similarly, the code that generates tries doesn't currently handle
4088  *      not-already-folded multi-char folds, and it looks like a pain to change
4089  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4090  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4091  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4092  *      using /iaa matching will be doing so almost entirely with ASCII
4093  *      strings, so this should rarely be encountered in practice */
4094
4095 STATIC U32
4096 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4097                    UV *min_subtract, bool *unfolded_multi_char,
4098                    U32 flags, regnode *val, U32 depth)
4099 {
4100     /* Merge several consecutive EXACTish nodes into one. */
4101
4102     regnode *n = regnext(scan);
4103     U32 stringok = 1;
4104     regnode *next = scan + NODE_SZ_STR(scan);
4105     U32 merged = 0;
4106     U32 stopnow = 0;
4107 #ifdef DEBUGGING
4108     regnode *stop = scan;
4109     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4110 #else
4111     PERL_UNUSED_ARG(depth);
4112 #endif
4113
4114     PERL_ARGS_ASSERT_JOIN_EXACT;
4115 #ifndef EXPERIMENTAL_INPLACESCAN
4116     PERL_UNUSED_ARG(flags);
4117     PERL_UNUSED_ARG(val);
4118 #endif
4119     DEBUG_PEEP("join", scan, depth, 0);
4120
4121     assert(PL_regkind[OP(scan)] == EXACT);
4122
4123     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4124      * EXACT ones that are mergeable to the current one. */
4125     while (    n
4126            && (    PL_regkind[OP(n)] == NOTHING
4127                || (stringok && PL_regkind[OP(n)] == EXACT))
4128            && NEXT_OFF(n)
4129            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4130     {
4131
4132         if (OP(n) == TAIL || n > next)
4133             stringok = 0;
4134         if (PL_regkind[OP(n)] == NOTHING) {
4135             DEBUG_PEEP("skip:", n, depth, 0);
4136             NEXT_OFF(scan) += NEXT_OFF(n);
4137             next = n + NODE_STEP_REGNODE;
4138 #ifdef DEBUGGING
4139             if (stringok)
4140                 stop = n;
4141 #endif
4142             n = regnext(n);
4143         }
4144         else if (stringok) {
4145             const unsigned int oldl = STR_LEN(scan);
4146             regnode * const nnext = regnext(n);
4147
4148             /* XXX I (khw) kind of doubt that this works on platforms (should
4149              * Perl ever run on one) where U8_MAX is above 255 because of lots
4150              * of other assumptions */
4151             /* Don't join if the sum can't fit into a single node */
4152             if (oldl + STR_LEN(n) > U8_MAX)
4153                 break;
4154
4155             /* Joining something that requires UTF-8 with something that
4156              * doesn't, means the result requires UTF-8. */
4157             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4158                 OP(scan) = EXACT_REQ8;
4159             }
4160             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4161                 ;   /* join is compatible, no need to change OP */
4162             }
4163             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4164                 OP(scan) = EXACTFU_REQ8;
4165             }
4166             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4167                 ;   /* join is compatible, no need to change OP */
4168             }
4169             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4170                 ;   /* join is compatible, no need to change OP */
4171             }
4172             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4173
4174                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4175                   * which can join with EXACTFU ones.  We check for this case
4176                   * here.  These need to be resolved to either EXACTFU or
4177                   * EXACTF at joining time.  They have nothing in them that
4178                   * would forbid them from being the more desirable EXACTFU
4179                   * nodes except that they begin and/or end with a single [Ss].
4180                   * The reason this is problematic is because they could be
4181                   * joined in this loop with an adjacent node that ends and/or
4182                   * begins with [Ss] which would then form the sequence 'ss',
4183                   * which matches differently under /di than /ui, in which case
4184                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4185                   * formed, the nodes get absorbed into any adjacent EXACTFU
4186                   * node.  And if the only adjacent node is EXACTF, they get
4187                   * absorbed into that, under the theory that a longer node is
4188                   * better than two shorter ones, even if one is EXACTFU.  Note
4189                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4190                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4191
4192                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4193
4194                     /* Here the joined node would end with 's'.  If the node
4195                      * following the combination is an EXACTF one, it's better to
4196                      * join this trailing edge 's' node with that one, leaving the
4197                      * current one in 'scan' be the more desirable EXACTFU */
4198                     if (OP(nnext) == EXACTF) {
4199                         break;
4200                     }
4201
4202                     OP(scan) = EXACTFU_S_EDGE;
4203
4204                 }   /* Otherwise, the beginning 's' of the 2nd node just
4205                        becomes an interior 's' in 'scan' */
4206             }
4207             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4208                 ;   /* join is compatible, no need to change OP */
4209             }
4210             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4211
4212                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4213                  * nodes.  But the latter nodes can be also joined with EXACTFU
4214                  * ones, and that is a better outcome, so if the node following
4215                  * 'n' is EXACTFU, quit now so that those two can be joined
4216                  * later */
4217                 if (OP(nnext) == EXACTFU) {
4218                     break;
4219                 }
4220
4221                 /* The join is compatible, and the combined node will be
4222                  * EXACTF.  (These don't care if they begin or end with 's' */
4223             }
4224             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4225                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4226                     && STRING(n)[0] == 's')
4227                 {
4228                     /* When combined, we have the sequence 'ss', which means we
4229                      * have to remain /di */
4230                     OP(scan) = EXACTF;
4231                 }
4232             }
4233             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4234                 if (STRING(n)[0] == 's') {
4235                     ;   /* Here the join is compatible and the combined node
4236                            starts with 's', no need to change OP */
4237                 }
4238                 else {  /* Now the trailing 's' is in the interior */
4239                     OP(scan) = EXACTFU;
4240                 }
4241             }
4242             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4243
4244                 /* The join is compatible, and the combined node will be
4245                  * EXACTF.  (These don't care if they begin or end with 's' */
4246                 OP(scan) = EXACTF;
4247             }
4248             else if (OP(scan) != OP(n)) {
4249
4250                 /* The only other compatible joinings are the same node type */
4251                 break;
4252             }
4253
4254             DEBUG_PEEP("merg", n, depth, 0);
4255             merged++;
4256
4257             NEXT_OFF(scan) += NEXT_OFF(n);
4258             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4259             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4260             next = n + NODE_SZ_STR(n);
4261             /* Now we can overwrite *n : */
4262             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4263 #ifdef DEBUGGING
4264             stop = next - 1;
4265 #endif
4266             n = nnext;
4267             if (stopnow) break;
4268         }
4269
4270 #ifdef EXPERIMENTAL_INPLACESCAN
4271         if (flags && !NEXT_OFF(n)) {
4272             DEBUG_PEEP("atch", val, depth, 0);
4273             if (reg_off_by_arg[OP(n)]) {
4274                 ARG_SET(n, val - n);
4275             }
4276             else {
4277                 NEXT_OFF(n) = val - n;
4278             }
4279             stopnow = 1;
4280         }
4281 #endif
4282     }
4283
4284     /* This temporary node can now be turned into EXACTFU, and must, as
4285      * regexec.c doesn't handle it */
4286     if (OP(scan) == EXACTFU_S_EDGE) {
4287         OP(scan) = EXACTFU;
4288     }
4289
4290     *min_subtract = 0;
4291     *unfolded_multi_char = FALSE;
4292
4293     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4294      * can now analyze for sequences of problematic code points.  (Prior to
4295      * this final joining, sequences could have been split over boundaries, and
4296      * hence missed).  The sequences only happen in folding, hence for any
4297      * non-EXACT EXACTish node */
4298     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4299         U8* s0 = (U8*) STRING(scan);
4300         U8* s = s0;
4301         U8* s_end = s0 + STR_LEN(scan);
4302
4303         int total_count_delta = 0;  /* Total delta number of characters that
4304                                        multi-char folds expand to */
4305
4306         /* One pass is made over the node's string looking for all the
4307          * possibilities.  To avoid some tests in the loop, there are two main
4308          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4309          * non-UTF-8 */
4310         if (UTF) {
4311             U8* folded = NULL;
4312
4313             if (OP(scan) == EXACTFL) {
4314                 U8 *d;
4315
4316                 /* An EXACTFL node would already have been changed to another
4317                  * node type unless there is at least one character in it that
4318                  * is problematic; likely a character whose fold definition
4319                  * won't be known until runtime, and so has yet to be folded.
4320                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4321                  * to handle the UTF-8 case, we need to create a temporary
4322                  * folded copy using UTF-8 locale rules in order to analyze it.
4323                  * This is because our macros that look to see if a sequence is
4324                  * a multi-char fold assume everything is folded (otherwise the
4325                  * tests in those macros would be too complicated and slow).
4326                  * Note that here, the non-problematic folds will have already
4327                  * been done, so we can just copy such characters.  We actually
4328                  * don't completely fold the EXACTFL string.  We skip the
4329                  * unfolded multi-char folds, as that would just create work
4330                  * below to figure out the size they already are */
4331
4332                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4333                 d = folded;
4334                 while (s < s_end) {
4335                     STRLEN s_len = UTF8SKIP(s);
4336                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4337                         Copy(s, d, s_len, U8);
4338                         d += s_len;
4339                     }
4340                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4341                         *unfolded_multi_char = TRUE;
4342                         Copy(s, d, s_len, U8);
4343                         d += s_len;
4344                     }
4345                     else if (isASCII(*s)) {
4346                         *(d++) = toFOLD(*s);
4347                     }
4348                     else {
4349                         STRLEN len;
4350                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4351                         d += len;
4352                     }
4353                     s += s_len;
4354                 }
4355
4356                 /* Point the remainder of the routine to look at our temporary
4357                  * folded copy */
4358                 s = folded;
4359                 s_end = d;
4360             } /* End of creating folded copy of EXACTFL string */
4361
4362             /* Examine the string for a multi-character fold sequence.  UTF-8
4363              * patterns have all characters pre-folded by the time this code is
4364              * executed */
4365             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4366                                      length sequence we are looking for is 2 */
4367             {
4368                 int count = 0;  /* How many characters in a multi-char fold */
4369                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4370                 if (! len) {    /* Not a multi-char fold: get next char */
4371                     s += UTF8SKIP(s);
4372                     continue;
4373                 }
4374
4375                 { /* Here is a generic multi-char fold. */
4376                     U8* multi_end  = s + len;
4377
4378                     /* Count how many characters are in it.  In the case of
4379                      * /aa, no folds which contain ASCII code points are
4380                      * allowed, so check for those, and skip if found. */
4381                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4382                         count = utf8_length(s, multi_end);
4383                         s = multi_end;
4384                     }
4385                     else {
4386                         while (s < multi_end) {
4387                             if (isASCII(*s)) {
4388                                 s++;
4389                                 goto next_iteration;
4390                             }
4391                             else {
4392                                 s += UTF8SKIP(s);
4393                             }
4394                             count++;
4395                         }
4396                     }
4397                 }
4398
4399                 /* The delta is how long the sequence is minus 1 (1 is how long
4400                  * the character that folds to the sequence is) */
4401                 total_count_delta += count - 1;
4402               next_iteration: ;
4403             }
4404
4405             /* We created a temporary folded copy of the string in EXACTFL
4406              * nodes.  Therefore we need to be sure it doesn't go below zero,
4407              * as the real string could be shorter */
4408             if (OP(scan) == EXACTFL) {
4409                 int total_chars = utf8_length((U8*) STRING(scan),
4410                                            (U8*) STRING(scan) + STR_LEN(scan));
4411                 if (total_count_delta > total_chars) {
4412                     total_count_delta = total_chars;
4413                 }
4414             }
4415
4416             *min_subtract += total_count_delta;
4417             Safefree(folded);
4418         }
4419         else if (OP(scan) == EXACTFAA) {
4420
4421             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4422              * fold to the ASCII range (and there are no existing ones in the
4423              * upper latin1 range).  But, as outlined in the comments preceding
4424              * this function, we need to flag any occurrences of the sharp s.
4425              * This character forbids trie formation (because of added
4426              * complexity) */
4427 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4428    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4429                                       || UNICODE_DOT_DOT_VERSION > 0)
4430             while (s < s_end) {
4431                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4432                     OP(scan) = EXACTFAA_NO_TRIE;
4433                     *unfolded_multi_char = TRUE;
4434                     break;
4435                 }
4436                 s++;
4437             }
4438         }
4439         else if (OP(scan) != EXACTFAA_NO_TRIE) {
4440
4441             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4442              * folds that are all Latin1.  As explained in the comments
4443              * preceding this function, we look also for the sharp s in EXACTF
4444              * and EXACTFL nodes; it can be in the final position.  Otherwise
4445              * we can stop looking 1 byte earlier because have to find at least
4446              * two characters for a multi-fold */
4447             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4448                               ? s_end
4449                               : s_end -1;
4450
4451             while (s < upper) {
4452                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4453                 if (! len) {    /* Not a multi-char fold. */
4454                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4455                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4456                     {
4457                         *unfolded_multi_char = TRUE;
4458                     }
4459                     s++;
4460                     continue;
4461                 }
4462
4463                 if (len == 2
4464                     && isALPHA_FOLD_EQ(*s, 's')
4465                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4466                 {
4467
4468                     /* EXACTF nodes need to know that the minimum length
4469                      * changed so that a sharp s in the string can match this
4470                      * ss in the pattern, but they remain EXACTF nodes, as they
4471                      * won't match this unless the target string is in UTF-8,
4472                      * which we don't know until runtime.  EXACTFL nodes can't
4473                      * transform into EXACTFU nodes */
4474                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4475                         OP(scan) = EXACTFUP;
4476                     }
4477                 }
4478
4479                 *min_subtract += len - 1;
4480                 s += len;
4481             }
4482 #endif
4483         }
4484     }
4485
4486 #ifdef DEBUGGING
4487     /* Allow dumping but overwriting the collection of skipped
4488      * ops and/or strings with fake optimized ops */
4489     n = scan + NODE_SZ_STR(scan);
4490     while (n <= stop) {
4491         OP(n) = OPTIMIZED;
4492         FLAGS(n) = 0;
4493         NEXT_OFF(n) = 0;
4494         n++;
4495     }
4496 #endif
4497     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4498     return stopnow;
4499 }
4500
4501 /* REx optimizer.  Converts nodes into quicker variants "in place".
4502    Finds fixed substrings.  */
4503
4504 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4505    to the position after last scanned or to NULL. */
4506
4507 #define INIT_AND_WITHP \
4508     assert(!and_withp); \
4509     Newx(and_withp, 1, regnode_ssc); \
4510     SAVEFREEPV(and_withp)
4511
4512
4513 static void
4514 S_unwind_scan_frames(pTHX_ const void *p)
4515 {
4516     scan_frame *f= (scan_frame *)p;
4517     do {
4518         scan_frame *n= f->next_frame;
4519         Safefree(f);
4520         f= n;
4521     } while (f);
4522 }
4523
4524 /* Follow the next-chain of the current node and optimize away
4525    all the NOTHINGs from it.
4526  */
4527 STATIC void
4528 S_rck_elide_nothing(pTHX_ regnode *node)
4529 {
4530     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4531
4532     if (OP(node) != CURLYX) {
4533         const int max = (reg_off_by_arg[OP(node)]
4534                         ? I32_MAX
4535                           /* I32 may be smaller than U16 on CRAYs! */
4536                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4537         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4538         int noff;
4539         regnode *n = node;
4540
4541         /* Skip NOTHING and LONGJMP. */
4542         while (
4543             (n = regnext(n))
4544             && (
4545                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4546                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4547             )
4548             && off + noff < max
4549         ) {
4550             off += noff;
4551         }
4552         if (reg_off_by_arg[OP(node)])
4553             ARG(node) = off;
4554         else
4555             NEXT_OFF(node) = off;
4556     }
4557     return;
4558 }
4559
4560 /* the return from this sub is the minimum length that could possibly match */
4561 STATIC SSize_t
4562 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4563                         SSize_t *minlenp, SSize_t *deltap,
4564                         regnode *last,
4565                         scan_data_t *data,
4566                         I32 stopparen,
4567                         U32 recursed_depth,
4568                         regnode_ssc *and_withp,
4569                         U32 flags, U32 depth, bool was_mutate_ok)
4570                         /* scanp: Start here (read-write). */
4571                         /* deltap: Write maxlen-minlen here. */
4572                         /* last: Stop before this one. */
4573                         /* data: string data about the pattern */
4574                         /* stopparen: treat close N as END */
4575                         /* recursed: which subroutines have we recursed into */
4576                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4577 {
4578     SSize_t final_minlen;
4579     /* There must be at least this number of characters to match */
4580     SSize_t min = 0;
4581     I32 pars = 0, code;
4582     regnode *scan = *scanp, *next;
4583     SSize_t delta = 0;
4584     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4585     int is_inf_internal = 0;            /* The studied chunk is infinite */
4586     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4587     scan_data_t data_fake;
4588     SV *re_trie_maxbuff = NULL;
4589     regnode *first_non_open = scan;
4590     SSize_t stopmin = OPTIMIZE_INFTY;
4591     scan_frame *frame = NULL;
4592     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4593
4594     PERL_ARGS_ASSERT_STUDY_CHUNK;
4595     RExC_study_started= 1;
4596
4597     Zero(&data_fake, 1, scan_data_t);
4598
4599     if ( depth == 0 ) {
4600         while (first_non_open && OP(first_non_open) == OPEN)
4601             first_non_open=regnext(first_non_open);
4602     }
4603
4604
4605   fake_study_recurse:
4606     DEBUG_r(
4607         RExC_study_chunk_recursed_count++;
4608     );
4609     DEBUG_OPTIMISE_MORE_r(
4610     {
4611         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4612             depth, (long)stopparen,
4613             (unsigned long)RExC_study_chunk_recursed_count,
4614             (unsigned long)depth, (unsigned long)recursed_depth,
4615             scan,
4616             last);
4617         if (recursed_depth) {
4618             U32 i;
4619             U32 j;
4620             for ( j = 0 ; j < recursed_depth ; j++ ) {
4621                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4622                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4623                         Perl_re_printf( aTHX_ " %d",(int)i);
4624                         break;
4625                     }
4626                 }
4627                 if ( j + 1 < recursed_depth ) {
4628                     Perl_re_printf( aTHX_  ",");
4629                 }
4630             }
4631         }
4632         Perl_re_printf( aTHX_ "\n");
4633     }
4634     );
4635     while ( scan && OP(scan) != END && scan < last ){
4636         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4637                                    node length to get a real minimum (because
4638                                    the folded version may be shorter) */
4639         bool unfolded_multi_char = FALSE;
4640         /* avoid mutating ops if we are anywhere within the recursed or
4641          * enframed handling for a GOSUB: the outermost level will handle it.
4642          */
4643         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4644         /* Peephole optimizer: */
4645         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4646         DEBUG_PEEP("Peep", scan, depth, flags);
4647
4648
4649         /* The reason we do this here is that we need to deal with things like
4650          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4651          * parsing code, as each (?:..) is handled by a different invocation of
4652          * reg() -- Yves
4653          */
4654         if (PL_regkind[OP(scan)] == EXACT
4655             && OP(scan) != LEXACT
4656             && OP(scan) != LEXACT_REQ8
4657             && mutate_ok
4658         ) {
4659             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4660                     0, NULL, depth + 1);
4661         }
4662
4663         /* Follow the next-chain of the current node and optimize
4664            away all the NOTHINGs from it.
4665          */
4666         rck_elide_nothing(scan);
4667
4668         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4669          * several different things.  */
4670         if ( OP(scan) == DEFINEP ) {
4671             SSize_t minlen = 0;
4672             SSize_t deltanext = 0;
4673             SSize_t fake_last_close = 0;
4674             I32 f = SCF_IN_DEFINE;
4675
4676             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4677             scan = regnext(scan);
4678             assert( OP(scan) == IFTHEN );
4679             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4680
4681             data_fake.last_closep= &fake_last_close;
4682             minlen = *minlenp;
4683             next = regnext(scan);
4684             scan = NEXTOPER(NEXTOPER(scan));
4685             DEBUG_PEEP("scan", scan, depth, flags);
4686             DEBUG_PEEP("next", next, depth, flags);
4687
4688             /* we suppose the run is continuous, last=next...
4689              * NOTE we dont use the return here! */
4690             /* DEFINEP study_chunk() recursion */
4691             (void)study_chunk(pRExC_state, &scan, &minlen,
4692                               &deltanext, next, &data_fake, stopparen,
4693                               recursed_depth, NULL, f, depth+1, mutate_ok);
4694
4695             scan = next;
4696         } else
4697         if (
4698             OP(scan) == BRANCH  ||
4699             OP(scan) == BRANCHJ ||
4700             OP(scan) == IFTHEN
4701         ) {
4702             next = regnext(scan);
4703             code = OP(scan);
4704
4705             /* The op(next)==code check below is to see if we
4706              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4707              * IFTHEN is special as it might not appear in pairs.
4708              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4709              * we dont handle it cleanly. */
4710             if (OP(next) == code || code == IFTHEN) {
4711                 /* NOTE - There is similar code to this block below for
4712                  * handling TRIE nodes on a re-study.  If you change stuff here
4713                  * check there too. */
4714                 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4715                 regnode_ssc accum;
4716                 regnode * const startbranch=scan;
4717
4718                 if (flags & SCF_DO_SUBSTR) {
4719                     /* Cannot merge strings after this. */
4720                     scan_commit(pRExC_state, data, minlenp, is_inf);
4721                 }
4722
4723                 if (flags & SCF_DO_STCLASS)
4724                     ssc_init_zero(pRExC_state, &accum);
4725
4726                 while (OP(scan) == code) {
4727                     SSize_t deltanext, minnext, fake;
4728                     I32 f = 0;
4729                     regnode_ssc this_class;
4730
4731                     DEBUG_PEEP("Branch", scan, depth, flags);
4732
4733                     num++;
4734                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4735                     if (data) {
4736                         data_fake.whilem_c = data->whilem_c;
4737                         data_fake.last_closep = data->last_closep;
4738                     }
4739                     else
4740                         data_fake.last_closep = &fake;
4741
4742                     data_fake.pos_delta = delta;
4743                     next = regnext(scan);
4744
4745                     scan = NEXTOPER(scan); /* everything */
4746                     if (code != BRANCH)    /* everything but BRANCH */
4747                         scan = NEXTOPER(scan);
4748
4749                     if (flags & SCF_DO_STCLASS) {
4750                         ssc_init(pRExC_state, &this_class);
4751                         data_fake.start_class = &this_class;
4752                         f = SCF_DO_STCLASS_AND;
4753                     }
4754                     if (flags & SCF_WHILEM_VISITED_POS)
4755                         f |= SCF_WHILEM_VISITED_POS;
4756
4757                     /* we suppose the run is continuous, last=next...*/
4758                     /* recurse study_chunk() for each BRANCH in an alternation */
4759                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4760                                       &deltanext, next, &data_fake, stopparen,
4761                                       recursed_depth, NULL, f, depth+1,
4762                                       mutate_ok);
4763
4764                     if (min1 > minnext)
4765                         min1 = minnext;
4766                     if (deltanext == OPTIMIZE_INFTY) {
4767                         is_inf = is_inf_internal = 1;
4768                         max1 = OPTIMIZE_INFTY;
4769                     } else if (max1 < minnext + deltanext)
4770                         max1 = minnext + deltanext;
4771                     scan = next;
4772                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4773                         pars++;
4774                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4775                         if ( stopmin > minnext)
4776                             stopmin = min + min1;
4777                         flags &= ~SCF_DO_SUBSTR;
4778                         if (data)
4779                             data->flags |= SCF_SEEN_ACCEPT;
4780                     }
4781                     if (data) {
4782                         if (data_fake.flags & SF_HAS_EVAL)
4783                             data->flags |= SF_HAS_EVAL;
4784                         data->whilem_c = data_fake.whilem_c;
4785                     }
4786                     if (flags & SCF_DO_STCLASS)
4787                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4788                 }
4789                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4790                     min1 = 0;
4791                 if (flags & SCF_DO_SUBSTR) {
4792                     data->pos_min += min1;
4793                     if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4794                         data->pos_delta = OPTIMIZE_INFTY;
4795                     else
4796                         data->pos_delta += max1 - min1;
4797                     if (max1 != min1 || is_inf)
4798                         data->cur_is_floating = 1;
4799                 }
4800                 min += min1;
4801                 if (delta == OPTIMIZE_INFTY
4802                  || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4803                     delta = OPTIMIZE_INFTY;
4804                 else
4805                     delta += max1 - min1;
4806                 if (flags & SCF_DO_STCLASS_OR) {
4807                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4808                     if (min1) {
4809                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4810                         flags &= ~SCF_DO_STCLASS;
4811                     }
4812                 }
4813                 else if (flags & SCF_DO_STCLASS_AND) {
4814                     if (min1) {
4815                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4816                         flags &= ~SCF_DO_STCLASS;
4817                     }
4818                     else {
4819                         /* Switch to OR mode: cache the old value of
4820                          * data->start_class */
4821                         INIT_AND_WITHP;
4822                         StructCopy(data->start_class, and_withp, regnode_ssc);
4823                         flags &= ~SCF_DO_STCLASS_AND;
4824                         StructCopy(&accum, data->start_class, regnode_ssc);
4825                         flags |= SCF_DO_STCLASS_OR;
4826                     }
4827                 }
4828
4829                 if (PERL_ENABLE_TRIE_OPTIMISATION
4830                     && OP(startbranch) == BRANCH
4831                     && mutate_ok
4832                 ) {
4833                 /* demq.
4834
4835                    Assuming this was/is a branch we are dealing with: 'scan'
4836                    now points at the item that follows the branch sequence,
4837                    whatever it is. We now start at the beginning of the
4838                    sequence and look for subsequences of
4839
4840                    BRANCH->EXACT=>x1
4841                    BRANCH->EXACT=>x2
4842                    tail
4843
4844                    which would be constructed from a pattern like
4845                    /A|LIST|OF|WORDS/
4846
4847                    If we can find such a subsequence we need to turn the first
4848                    element into a trie and then add the subsequent branch exact
4849                    strings to the trie.
4850
4851                    We have two cases
4852
4853                      1. patterns where the whole set of branches can be
4854                         converted.
4855
4856                      2. patterns where only a subset can be converted.
4857
4858                    In case 1 we can replace the whole set with a single regop
4859                    for the trie. In case 2 we need to keep the start and end
4860                    branches so
4861
4862                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4863                      becomes BRANCH TRIE; BRANCH X;
4864
4865                   There is an additional case, that being where there is a
4866                   common prefix, which gets split out into an EXACT like node
4867                   preceding the TRIE node.
4868
4869                   If x(1..n)==tail then we can do a simple trie, if not we make
4870                   a "jump" trie, such that when we match the appropriate word
4871                   we "jump" to the appropriate tail node. Essentially we turn
4872                   a nested if into a case structure of sorts.
4873
4874                 */
4875
4876                     int made=0;
4877                     if (!re_trie_maxbuff) {
4878                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4879                         if (!SvIOK(re_trie_maxbuff))
4880                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4881                     }
4882                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4883                         regnode *cur;
4884                         regnode *first = (regnode *)NULL;
4885                         regnode *prev = (regnode *)NULL;
4886                         regnode *tail = scan;
4887                         U8 trietype = 0;
4888                         U32 count=0;
4889
4890                         /* var tail is used because there may be a TAIL
4891                            regop in the way. Ie, the exacts will point to the
4892                            thing following the TAIL, but the last branch will
4893                            point at the TAIL. So we advance tail. If we
4894                            have nested (?:) we may have to move through several
4895                            tails.
4896                          */
4897
4898                         while ( OP( tail ) == TAIL ) {
4899                             /* this is the TAIL generated by (?:) */
4900                             tail = regnext( tail );
4901                         }
4902
4903
4904                         DEBUG_TRIE_COMPILE_r({
4905                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4906                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4907                               depth+1,
4908                               "Looking for TRIE'able sequences. Tail node is ",
4909                               (UV) REGNODE_OFFSET(tail),
4910                               SvPV_nolen_const( RExC_mysv )
4911                             );
4912                         });
4913
4914                         /*
4915
4916                             Step through the branches
4917                                 cur represents each branch,
4918                                 noper is the first thing to be matched as part
4919                                       of that branch
4920                                 noper_next is the regnext() of that node.
4921
4922                             We normally handle a case like this
4923                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4924                             support building with NOJUMPTRIE, which restricts
4925                             the trie logic to structures like /FOO|BAR/.
4926
4927                             If noper is a trieable nodetype then the branch is
4928                             a possible optimization target. If we are building
4929                             under NOJUMPTRIE then we require that noper_next is
4930                             the same as scan (our current position in the regex
4931                             program).
4932
4933                             Once we have two or more consecutive such branches
4934                             we can create a trie of the EXACT's contents and
4935                             stitch it in place into the program.
4936
4937                             If the sequence represents all of the branches in
4938                             the alternation we replace the entire thing with a
4939                             single TRIE node.
4940
4941                             Otherwise when it is a subsequence we need to
4942                             stitch it in place and replace only the relevant
4943                             branches. This means the first branch has to remain
4944                             as it is used by the alternation logic, and its
4945                             next pointer, and needs to be repointed at the item
4946                             on the branch chain following the last branch we
4947                             have optimized away.
4948
4949                             This could be either a BRANCH, in which case the
4950                             subsequence is internal, or it could be the item
4951                             following the branch sequence in which case the
4952                             subsequence is at the end (which does not
4953                             necessarily mean the first node is the start of the
4954                             alternation).
4955
4956                             TRIE_TYPE(X) is a define which maps the optype to a
4957                             trietype.
4958
4959                                 optype          |  trietype
4960                                 ----------------+-----------
4961                                 NOTHING         | NOTHING
4962                                 EXACT           | EXACT
4963                                 EXACT_REQ8     | EXACT
4964                                 EXACTFU         | EXACTFU
4965                                 EXACTFU_REQ8   | EXACTFU
4966                                 EXACTFUP        | EXACTFU
4967                                 EXACTFAA        | EXACTFAA
4968                                 EXACTL          | EXACTL
4969                                 EXACTFLU8       | EXACTFLU8
4970
4971
4972                         */
4973 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4974                        ? NOTHING                                            \
4975                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4976                          ? EXACT                                            \
4977                          : (     EXACTFU == (X)                             \
4978                               || EXACTFU_REQ8 == (X)                       \
4979                               || EXACTFUP == (X) )                          \
4980                            ? EXACTFU                                        \
4981                            : ( EXACTFAA == (X) )                            \
4982                              ? EXACTFAA                                     \
4983                              : ( EXACTL == (X) )                            \
4984                                ? EXACTL                                     \
4985                                : ( EXACTFLU8 == (X) )                       \
4986                                  ? EXACTFLU8                                \
4987                                  : 0 )
4988
4989                         /* dont use tail as the end marker for this traverse */
4990                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4991                             regnode * const noper = NEXTOPER( cur );
4992                             U8 noper_type = OP( noper );
4993                             U8 noper_trietype = TRIE_TYPE( noper_type );
4994 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4995                             regnode * const noper_next = regnext( noper );
4996                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4997                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4998 #endif
4999
5000                             DEBUG_TRIE_COMPILE_r({
5001                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5002                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
5003                                    depth+1,
5004                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5005
5006                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5007                                 Perl_re_printf( aTHX_  " -> %d:%s",
5008                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5009
5010                                 if ( noper_next ) {
5011                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5012                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5013                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5014                                 }
5015                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5016                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5017                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5018                                 );
5019                             });
5020
5021                             /* Is noper a trieable nodetype that can be merged
5022                              * with the current trie (if there is one)? */
5023                             if ( noper_trietype
5024                                   &&
5025                                   (
5026                                         ( noper_trietype == NOTHING )
5027                                         || ( trietype == NOTHING )
5028                                         || ( trietype == noper_trietype )
5029                                   )
5030 #ifdef NOJUMPTRIE
5031                                   && noper_next >= tail
5032 #endif
5033                                   && count < U16_MAX)
5034                             {
5035                                 /* Handle mergable triable node Either we are
5036                                  * the first node in a new trieable sequence,
5037                                  * in which case we do some bookkeeping,
5038                                  * otherwise we update the end pointer. */
5039                                 if ( !first ) {
5040                                     first = cur;
5041                                     if ( noper_trietype == NOTHING ) {
5042 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5043                                         regnode * const noper_next = regnext( noper );
5044                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5045                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5046 #endif
5047
5048                                         if ( noper_next_trietype ) {
5049                                             trietype = noper_next_trietype;
5050                                         } else if (noper_next_type)  {
5051                                             /* a NOTHING regop is 1 regop wide.
5052                                              * We need at least two for a trie
5053                                              * so we can't merge this in */
5054                                             first = NULL;
5055                                         }
5056                                     } else {
5057                                         trietype = noper_trietype;
5058                                     }
5059                                 } else {
5060                                     if ( trietype == NOTHING )
5061                                         trietype = noper_trietype;
5062                                     prev = cur;
5063                                 }
5064                                 if (first)
5065                                     count++;
5066                             } /* end handle mergable triable node */
5067                             else {
5068                                 /* handle unmergable node -
5069                                  * noper may either be a triable node which can
5070                                  * not be tried together with the current trie,
5071                                  * or a non triable node */
5072                                 if ( prev ) {
5073                                     /* If last is set and trietype is not
5074                                      * NOTHING then we have found at least two
5075                                      * triable branch sequences in a row of a
5076                                      * similar trietype so we can turn them
5077                                      * into a trie. If/when we allow NOTHING to
5078                                      * start a trie sequence this condition
5079                                      * will be required, and it isn't expensive
5080                                      * so we leave it in for now. */
5081                                     if ( trietype && trietype != NOTHING )
5082                                         make_trie( pRExC_state,
5083                                                 startbranch, first, cur, tail,
5084                                                 count, trietype, depth+1 );
5085                                     prev = NULL; /* note: we clear/update
5086                                                     first, trietype etc below,
5087                                                     so we dont do it here */
5088                                 }
5089                                 if ( noper_trietype
5090 #ifdef NOJUMPTRIE
5091                                      && noper_next >= tail
5092 #endif
5093                                 ){
5094                                     /* noper is triable, so we can start a new
5095                                      * trie sequence */
5096                                     count = 1;
5097                                     first = cur;
5098                                     trietype = noper_trietype;
5099                                 } else if (first) {
5100                                     /* if we already saw a first but the
5101                                      * current node is not triable then we have
5102                                      * to reset the first information. */
5103                                     count = 0;
5104                                     first = NULL;
5105                                     trietype = 0;
5106                                 }
5107                             } /* end handle unmergable node */
5108                         } /* loop over branches */
5109                         DEBUG_TRIE_COMPILE_r({
5110                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5111                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5112                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5113                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5114                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5115                                PL_reg_name[trietype]
5116                             );
5117
5118                         });
5119                         if ( prev && trietype ) {
5120                             if ( trietype != NOTHING ) {
5121                                 /* the last branch of the sequence was part of
5122                                  * a trie, so we have to construct it here
5123                                  * outside of the loop */
5124                                 made= make_trie( pRExC_state, startbranch,
5125                                                  first, scan, tail, count,
5126                                                  trietype, depth+1 );
5127 #ifdef TRIE_STUDY_OPT
5128                                 if ( ((made == MADE_EXACT_TRIE &&
5129                                      startbranch == first)
5130                                      || ( first_non_open == first )) &&
5131                                      depth==0 ) {
5132                                     flags |= SCF_TRIE_RESTUDY;
5133                                     if ( startbranch == first
5134                                          && scan >= tail )
5135                                     {
5136                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5137                                     }
5138                                 }
5139 #endif
5140                             } else {
5141                                 /* at this point we know whatever we have is a
5142                                  * NOTHING sequence/branch AND if 'startbranch'
5143                                  * is 'first' then we can turn the whole thing
5144                                  * into a NOTHING
5145                                  */
5146                                 if ( startbranch == first ) {
5147                                     regnode *opt;
5148                                     /* the entire thing is a NOTHING sequence,
5149                                      * something like this: (?:|) So we can
5150                                      * turn it into a plain NOTHING op. */
5151                                     DEBUG_TRIE_COMPILE_r({
5152                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5153                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5154                                           depth+1,
5155                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5156
5157                                     });
5158                                     OP(startbranch)= NOTHING;
5159                                     NEXT_OFF(startbranch)= tail - startbranch;
5160                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5161                                         OP(opt)= OPTIMIZED;
5162                                 }
5163                             }
5164                         } /* end if ( prev) */
5165                     } /* TRIE_MAXBUF is non zero */
5166                 } /* do trie */
5167
5168             }
5169             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5170                 scan = NEXTOPER(NEXTOPER(scan));
5171             } else                      /* single branch is optimized. */
5172                 scan = NEXTOPER(scan);
5173             continue;
5174         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5175             I32 paren = 0;
5176             regnode *start = NULL;
5177             regnode *end = NULL;
5178             U32 my_recursed_depth= recursed_depth;
5179
5180             if (OP(scan) != SUSPEND) { /* GOSUB */
5181                 /* Do setup, note this code has side effects beyond
5182                  * the rest of this block. Specifically setting
5183                  * RExC_recurse[] must happen at least once during
5184                  * study_chunk(). */
5185                 paren = ARG(scan);
5186                 RExC_recurse[ARG2L(scan)] = scan;
5187                 start = REGNODE_p(RExC_open_parens[paren]);
5188                 end   = REGNODE_p(RExC_close_parens[paren]);
5189
5190                 /* NOTE we MUST always execute the above code, even
5191                  * if we do nothing with a GOSUB */
5192                 if (
5193                     ( flags & SCF_IN_DEFINE )
5194                     ||
5195                     (
5196                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5197                         &&
5198                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5199                     )
5200                 ) {
5201                     /* no need to do anything here if we are in a define. */
5202                     /* or we are after some kind of infinite construct
5203                      * so we can skip recursing into this item.
5204                      * Since it is infinite we will not change the maxlen
5205                      * or delta, and if we miss something that might raise
5206                      * the minlen it will merely pessimise a little.
5207                      *
5208                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5209                      * might result in a minlen of 1 and not of 4,
5210                      * but this doesn't make us mismatch, just try a bit
5211                      * harder than we should.
5212                      *
5213                      * However we must assume this GOSUB is infinite, to
5214                      * avoid wrongly applying other optimizations in the
5215                      * enclosing scope - see GH 18096, for example.
5216                      */
5217                     is_inf = is_inf_internal = 1;
5218                     scan= regnext(scan);
5219                     continue;
5220                 }
5221
5222                 if (
5223                     !recursed_depth
5224                     || !PAREN_TEST(recursed_depth - 1, paren)
5225                 ) {
5226                     /* it is quite possible that there are more efficient ways
5227                      * to do this. We maintain a bitmap per level of recursion
5228                      * of which patterns we have entered so we can detect if a
5229                      * pattern creates a possible infinite loop. When we
5230                      * recurse down a level we copy the previous levels bitmap
5231                      * down. When we are at recursion level 0 we zero the top
5232                      * level bitmap. It would be nice to implement a different
5233                      * more efficient way of doing this. In particular the top
5234                      * level bitmap may be unnecessary.
5235                      */
5236                     if (!recursed_depth) {
5237                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5238                     } else {
5239                         Copy(PAREN_OFFSET(recursed_depth - 1),
5240                              PAREN_OFFSET(recursed_depth),
5241                              RExC_study_chunk_recursed_bytes, U8);
5242                     }
5243                     /* we havent recursed into this paren yet, so recurse into it */
5244                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5245                     PAREN_SET(recursed_depth, paren);
5246                     my_recursed_depth= recursed_depth + 1;
5247                 } else {
5248                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5249                     /* some form of infinite recursion, assume infinite length
5250                      * */
5251                     if (flags & SCF_DO_SUBSTR) {
5252                         scan_commit(pRExC_state, data, minlenp, is_inf);
5253                         data->cur_is_floating = 1;
5254                     }
5255                     is_inf = is_inf_internal = 1;
5256                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5257                         ssc_anything(data->start_class);
5258                     flags &= ~SCF_DO_STCLASS;
5259
5260                     start= NULL; /* reset start so we dont recurse later on. */
5261                 }
5262             } else {
5263                 paren = stopparen;
5264                 start = scan + 2;
5265                 end = regnext(scan);
5266             }
5267             if (start) {
5268                 scan_frame *newframe;
5269                 assert(end);
5270                 if (!RExC_frame_last) {
5271                     Newxz(newframe, 1, scan_frame);
5272                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5273                     RExC_frame_head= newframe;
5274                     RExC_frame_count++;
5275                 } else if (!RExC_frame_last->next_frame) {
5276                     Newxz(newframe, 1, scan_frame);
5277                     RExC_frame_last->next_frame= newframe;
5278                     newframe->prev_frame= RExC_frame_last;
5279                     RExC_frame_count++;
5280                 } else {
5281                     newframe= RExC_frame_last->next_frame;
5282                 }
5283                 RExC_frame_last= newframe;
5284
5285                 newframe->next_regnode = regnext(scan);
5286                 newframe->last_regnode = last;
5287                 newframe->stopparen = stopparen;
5288                 newframe->prev_recursed_depth = recursed_depth;
5289                 newframe->this_prev_frame= frame;
5290                 newframe->in_gosub = (
5291                     (frame && frame->in_gosub) || OP(scan) == GOSUB
5292                 );
5293
5294                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5295                 DEBUG_PEEP("fnew", scan, depth, flags);
5296
5297                 frame = newframe;
5298                 scan =  start;
5299                 stopparen = paren;
5300                 last = end;
5301                 depth = depth + 1;
5302                 recursed_depth= my_recursed_depth;
5303
5304                 continue;
5305             }
5306         }
5307         else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) {
5308             SSize_t bytelen = STR_LEN(scan), charlen;
5309             UV uc;
5310             assert(bytelen);
5311             if (UTF) {
5312                 const U8 * const s = (U8*)STRING(scan);
5313                 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5314                 charlen = utf8_length(s, s + bytelen);
5315             } else {
5316                 uc = *((U8*)STRING(scan));
5317                 charlen = bytelen;
5318             }
5319             min += charlen;
5320             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5321                 /* The code below prefers earlier match for fixed
5322                    offset, later match for variable offset.  */
5323                 if (data->last_end == -1) { /* Update the start info. */
5324                     data->last_start_min = data->pos_min;
5325                     data->last_start_max =
5326                         is_inf ? OPTIMIZE_INFTY
5327                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5328                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5329                 }
5330                 sv_catpvn(data->last_found, STRING(scan), bytelen);
5331                 if (UTF)
5332                     SvUTF8_on(data->last_found);
5333                 {
5334                     SV * const sv = data->last_found;
5335                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5336                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5337                     if (mg && mg->mg_len >= 0)
5338                         mg->mg_len += charlen;
5339                 }
5340                 data->last_end = data->pos_min + charlen;
5341                 data->pos_min += charlen; /* As in the first entry. */
5342                 data->flags &= ~SF_BEFORE_EOL;
5343             }
5344
5345             /* ANDing the code point leaves at most it, and not in locale, and
5346              * can't match null string */
5347             if (flags & SCF_DO_STCLASS_AND) {
5348                 ssc_cp_and(data->start_class, uc);
5349                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5350                 ssc_clear_locale(data->start_class);
5351             }
5352             else if (flags & SCF_DO_STCLASS_OR) {
5353                 ssc_add_cp(data->start_class, uc);
5354                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5355
5356                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5357                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5358             }
5359             flags &= ~SCF_DO_STCLASS;
5360         }
5361         else if (PL_regkind[OP(scan)] == EXACT) {
5362             /* But OP != EXACT!, so is EXACTFish */
5363             SSize_t bytelen = STR_LEN(scan), charlen;
5364             const U8 * s = (U8*)STRING(scan);
5365
5366             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5367              * with the mask set to the complement of the bit that differs
5368              * between upper and lower case, and the lowest code point of the
5369              * pair (which the '&' forces) */
5370             if (     bytelen == 1
5371                 &&   isALPHA_A(*s)
5372                 &&  (         OP(scan) == EXACTFAA
5373                      || (     OP(scan) == EXACTFU
5374                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5375                 &&   mutate_ok
5376             ) {
5377                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5378
5379                 OP(scan) = ANYOFM;
5380                 ARG_SET(scan, *s & mask);
5381                 FLAGS(scan) = mask;
5382                 /* we're not EXACTFish any more, so restudy */
5383                 continue;
5384             }
5385
5386             /* Search for fixed substrings supports EXACT only. */
5387             if (flags & SCF_DO_SUBSTR) {
5388                 assert(data);
5389                 scan_commit(pRExC_state, data, minlenp, is_inf);
5390             }
5391             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5392             if (unfolded_multi_char) {
5393                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5394             }
5395             min += charlen - min_subtract;
5396             assert (min >= 0);
5397             delta += min_subtract;
5398             if (flags & SCF_DO_SUBSTR) {
5399                 data->pos_min += charlen - min_subtract;
5400                 if (data->pos_min < 0) {
5401                     data->pos_min = 0;
5402                 }
5403                 data->pos_delta += min_subtract;
5404                 if (min_subtract) {
5405                     data->cur_is_floating = 1; /* float */
5406                 }
5407             }
5408
5409             if (flags & SCF_DO_STCLASS) {
5410                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5411
5412                 assert(EXACTF_invlist);
5413                 if (flags & SCF_DO_STCLASS_AND) {
5414                     if (OP(scan) != EXACTFL)
5415                         ssc_clear_locale(data->start_class);
5416                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5417                     ANYOF_POSIXL_ZERO(data->start_class);
5418                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5419                 }
5420                 else {  /* SCF_DO_STCLASS_OR */
5421                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5422                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5423
5424                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5425                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5426                 }
5427                 flags &= ~SCF_DO_STCLASS;
5428                 SvREFCNT_dec(EXACTF_invlist);
5429             }
5430         }
5431         else if (REGNODE_VARIES(OP(scan))) {
5432             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5433             I32 fl = 0, f = flags;
5434             regnode * const oscan = scan;
5435             regnode_ssc this_class;
5436             regnode_ssc *oclass = NULL;
5437             I32 next_is_eval = 0;
5438
5439             switch (PL_regkind[OP(scan)]) {
5440             case WHILEM:                /* End of (?:...)* . */
5441                 scan = NEXTOPER(scan);
5442                 goto finish;
5443             case PLUS:
5444                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5445                     next = NEXTOPER(scan);
5446                     if (   (     PL_regkind[OP(next)] == EXACT
5447                             && ! isEXACTFish(OP(next)))
5448                         || (flags & SCF_DO_STCLASS))
5449                     {
5450                         mincount = 1;
5451                         maxcount = REG_INFTY;
5452                         next = regnext(scan);
5453                         scan = NEXTOPER(scan);
5454                         goto do_curly;
5455                     }
5456                 }
5457                 if (flags & SCF_DO_SUBSTR)
5458                     data->pos_min++;
5459                 /* This will bypass the formal 'min += minnext * mincount'
5460                  * calculation in the do_curly path, so assumes min width
5461                  * of the PLUS payload is exactly one. */
5462                 min++;
5463                 /* FALLTHROUGH */
5464             case STAR:
5465                 next = NEXTOPER(scan);
5466
5467                 /* This temporary node can now be turned into EXACTFU, and
5468                  * must, as regexec.c doesn't handle it */
5469                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5470                     OP(next) = EXACTFU;
5471                 }
5472
5473                 if (     STR_LEN(next) == 1
5474                     &&   isALPHA_A(* STRING(next))
5475                     && (         OP(next) == EXACTFAA
5476                         || (     OP(next) == EXACTFU
5477                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5478                     &&   mutate_ok
5479                 ) {
5480                     /* These differ in just one bit */
5481                     U8 mask = ~ ('A' ^ 'a');
5482
5483                     assert(isALPHA_A(* STRING(next)));
5484
5485                     /* Then replace it by an ANYOFM node, with
5486                     * the mask set to the complement of the
5487                     * bit that differs between upper and lower
5488                     * case, and the lowest code point of the
5489                     * pair (which the '&' forces) */
5490                     OP(next) = ANYOFM;
5491                     ARG_SET(next, *STRING(next) & mask);
5492                     FLAGS(next) = mask;
5493                 }
5494
5495                 if (flags & SCF_DO_STCLASS) {
5496                     mincount = 0;
5497                     maxcount = REG_INFTY;
5498                     next = regnext(scan);
5499                     scan = NEXTOPER(scan);
5500                     goto do_curly;
5501                 }
5502                 if (flags & SCF_DO_SUBSTR) {
5503                     scan_commit(pRExC_state, data, minlenp, is_inf);
5504                     /* Cannot extend fixed substrings */
5505                     data->cur_is_floating = 1; /* float */
5506                 }
5507                 is_inf = is_inf_internal = 1;
5508                 scan = regnext(scan);
5509                 goto optimize_curly_tail;
5510             case CURLY:
5511                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5512                     && (scan->flags == stopparen))
5513                 {
5514                     mincount = 1;
5515                     maxcount = 1;
5516                 } else {
5517                     mincount = ARG1(scan);
5518                     maxcount = ARG2(scan);
5519                 }
5520                 next = regnext(scan);
5521                 if (OP(scan) == CURLYX) {
5522                     I32 lp = (data ? *(data->last_closep) : 0);
5523                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5524                 }
5525                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5526                 next_is_eval = (OP(scan) == EVAL);
5527               do_curly:
5528                 if (flags & SCF_DO_SUBSTR) {
5529                     if (mincount == 0)
5530                         scan_commit(pRExC_state, data, minlenp, is_inf);
5531                     /* Cannot extend fixed substrings */
5532                     pos_before = data->pos_min;
5533                 }
5534                 if (data) {
5535                     fl = data->flags;
5536                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5537                     if (is_inf)
5538                         data->flags |= SF_IS_INF;
5539                 }
5540                 if (flags & SCF_DO_STCLASS) {
5541                     ssc_init(pRExC_state, &this_class);
5542                     oclass = data->start_class;
5543                     data->start_class = &this_class;
5544                     f |= SCF_DO_STCLASS_AND;
5545                     f &= ~SCF_DO_STCLASS_OR;
5546                 }
5547                 /* Exclude from super-linear cache processing any {n,m}
5548                    regops for which the combination of input pos and regex
5549                    pos is not enough information to determine if a match
5550                    will be possible.
5551
5552                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5553                    regex pos at the \s*, the prospects for a match depend not
5554                    only on the input position but also on how many (bar\s*)
5555                    repeats into the {4,8} we are. */
5556                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5557                     f &= ~SCF_WHILEM_VISITED_POS;
5558
5559                 /* This will finish on WHILEM, setting scan, or on NULL: */
5560                 /* recurse study_chunk() on loop bodies */
5561                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5562                                   last, data, stopparen, recursed_depth, NULL,
5563                                   (mincount == 0
5564                                    ? (f & ~SCF_DO_SUBSTR)
5565                                    : f)
5566                                   , depth+1, mutate_ok);
5567
5568                 if (flags & SCF_DO_STCLASS)
5569                     data->start_class = oclass;
5570                 if (mincount == 0 || minnext == 0) {
5571                     if (flags & SCF_DO_STCLASS_OR) {
5572                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5573                     }
5574                     else if (flags & SCF_DO_STCLASS_AND) {
5575                         /* Switch to OR mode: cache the old value of
5576                          * data->start_class */
5577                         INIT_AND_WITHP;
5578                         StructCopy(data->start_class, and_withp, regnode_ssc);
5579                         flags &= ~SCF_DO_STCLASS_AND;
5580                         StructCopy(&this_class, data->start_class, regnode_ssc);
5581                         flags |= SCF_DO_STCLASS_OR;
5582                         ANYOF_FLAGS(data->start_class)
5583                                                 |= SSC_MATCHES_EMPTY_STRING;
5584                     }
5585                 } else {                /* Non-zero len */
5586                     if (flags & SCF_DO_STCLASS_OR) {
5587                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5588                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5589                     }
5590                     else if (flags & SCF_DO_STCLASS_AND)
5591                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5592                     flags &= ~SCF_DO_STCLASS;
5593                 }
5594                 if (!scan)              /* It was not CURLYX, but CURLY. */
5595                     scan = next;
5596                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5597                     /* ? quantifier ok, except for (?{ ... }) */
5598                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5599                     && (minnext == 0) && (deltanext == 0)
5600                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5601                     && maxcount <= REG_INFTY/3) /* Complement check for big
5602                                                    count */
5603                 {
5604                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5605                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5606                             "Quantifier unexpected on zero-length expression "
5607                             "in regex m/%" UTF8f "/",
5608                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5609                                   RExC_precomp)));
5610                 }
5611
5612                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5613                     || min >= SSize_t_MAX - minnext * mincount )
5614                 {
5615                     FAIL("Regexp out of space");
5616                 }
5617
5618                 min += minnext * mincount;
5619                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5620                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5621                 is_inf |= is_inf_internal;
5622                 if (is_inf) {
5623                     delta = OPTIMIZE_INFTY;
5624                 } else {
5625                     delta += (minnext + deltanext) * maxcount
5626                              - minnext * mincount;
5627                 }
5628                 /* Try powerful optimization CURLYX => CURLYN. */
5629                 if (  OP(oscan) == CURLYX && data
5630                       && data->flags & SF_IN_PAR
5631                       && !(data->flags & SF_HAS_EVAL)
5632                       && !deltanext && minnext == 1
5633                       && mutate_ok
5634                 ) {
5635                     /* Try to optimize to CURLYN.  */
5636                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5637                     regnode * const nxt1 = nxt;
5638 #ifdef DEBUGGING
5639                     regnode *nxt2;
5640 #endif
5641
5642                     /* Skip open. */
5643                     nxt = regnext(nxt);
5644                     if (!REGNODE_SIMPLE(OP(nxt))
5645                         && !(PL_regkind[OP(nxt)] == EXACT
5646                              && STR_LEN(nxt) == 1))
5647                         goto nogo;
5648 #ifdef DEBUGGING
5649                     nxt2 = nxt;
5650 #endif
5651                     nxt = regnext(nxt);
5652                     if (OP(nxt) != CLOSE)
5653                         goto nogo;
5654                     if (RExC_open_parens) {
5655
5656                         /*open->CURLYM*/
5657                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5658
5659                         /*close->while*/
5660                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5661                     }
5662                     /* Now we know that nxt2 is the only contents: */
5663                     oscan->flags = (U8)ARG(nxt);
5664                     OP(oscan) = CURLYN;
5665                     OP(nxt1) = NOTHING; /* was OPEN. */
5666
5667 #ifdef DEBUGGING
5668                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5669                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5670                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5671                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5672                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5673                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5674 #endif
5675                 }
5676               nogo:
5677
5678                 /* Try optimization CURLYX => CURLYM. */
5679                 if (  OP(oscan) == CURLYX && data
5680                       && !(data->flags & SF_HAS_PAR)
5681                       && !(data->flags & SF_HAS_EVAL)
5682                       && !deltanext     /* atom is fixed width */
5683                       && minnext != 0   /* CURLYM can't handle zero width */
5684                          /* Nor characters whose fold at run-time may be
5685                           * multi-character */
5686                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5687                       && mutate_ok
5688                 ) {
5689                     /* XXXX How to optimize if data == 0? */
5690                     /* Optimize to a simpler form.  */
5691                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5692                     regnode *nxt2;
5693
5694                     OP(oscan) = CURLYM;
5695                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5696                             && (OP(nxt2) != WHILEM))
5697                         nxt = nxt2;
5698                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5699                     /* Need to optimize away parenths. */
5700                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5701                         /* Set the parenth number.  */
5702                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5703
5704                         oscan->flags = (U8)ARG(nxt);
5705                         if (RExC_open_parens) {
5706                              /*open->CURLYM*/
5707                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5708
5709                             /*close->NOTHING*/
5710                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5711                                                          + 1;
5712                         }
5713                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5714                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5715
5716 #ifdef DEBUGGING
5717                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5718                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5719                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5720                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5721 #endif
5722 #if 0
5723                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5724                             regnode *nnxt = regnext(nxt1);
5725                             if (nnxt == nxt) {
5726                                 if (reg_off_by_arg[OP(nxt1)])
5727                                     ARG_SET(nxt1, nxt2 - nxt1);
5728                                 else if (nxt2 - nxt1 < U16_MAX)
5729                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5730                                 else
5731                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5732                             }
5733                             nxt1 = nnxt;
5734                         }
5735 #endif
5736                         /* Optimize again: */
5737                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5738                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5739                                     NULL, stopparen, recursed_depth, NULL, 0,
5740                                     depth+1, mutate_ok);
5741                     }
5742                     else
5743                         oscan->flags = 0;
5744                 }
5745                 else if ((OP(oscan) == CURLYX)
5746                          && (flags & SCF_WHILEM_VISITED_POS)
5747                          /* See the comment on a similar expression above.
5748                             However, this time it's not a subexpression
5749                             we care about, but the expression itself. */
5750                          && (maxcount == REG_INFTY)
5751                          && data) {
5752                     /* This stays as CURLYX, we can put the count/of pair. */
5753                     /* Find WHILEM (as in regexec.c) */
5754                     regnode *nxt = oscan + NEXT_OFF(oscan);
5755
5756                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5757                         nxt += ARG(nxt);
5758                     nxt = PREVOPER(nxt);
5759                     if (nxt->flags & 0xf) {
5760                         /* we've already set whilem count on this node */
5761                     } else if (++data->whilem_c < 16) {
5762                         assert(data->whilem_c <= RExC_whilem_seen);
5763                         nxt->flags = (U8)(data->whilem_c
5764                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5765                     }
5766                 }
5767                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5768                     pars++;
5769                 if (flags & SCF_DO_SUBSTR) {
5770                     SV *last_str = NULL;
5771                     STRLEN last_chrs = 0;
5772                     int counted = mincount != 0;
5773
5774                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5775                                                                   string. */
5776                         SSize_t b = pos_before >= data->last_start_min
5777                             ? pos_before : data->last_start_min;
5778                         STRLEN l;
5779                         const char * const s = SvPV_const(data->last_found, l);
5780                         SSize_t old = b - data->last_start_min;
5781                         assert(old >= 0);
5782
5783                         if (UTF)
5784                             old = utf8_hop_forward((U8*)s, old,
5785                                                (U8 *) SvEND(data->last_found))
5786                                 - (U8*)s;
5787                         l -= old;
5788                         /* Get the added string: */
5789                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5790                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5791                                             (U8*)(s + old + l)) : l;
5792                         if (deltanext == 0 && pos_before == b) {
5793                             /* What was added is a constant string */
5794                             if (mincount > 1) {
5795
5796                                 SvGROW(last_str, (mincount * l) + 1);
5797                                 repeatcpy(SvPVX(last_str) + l,
5798                                           SvPVX_const(last_str), l,
5799                                           mincount - 1);
5800                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5801                                 /* Add additional parts. */
5802                                 SvCUR_set(data->last_found,
5803                                           SvCUR(data->last_found) - l);
5804                                 sv_catsv(data->last_found, last_str);
5805                                 {
5806                                     SV * sv = data->last_found;
5807                                     MAGIC *mg =
5808                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5809                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5810                                     if (mg && mg->mg_len >= 0)
5811                                         mg->mg_len += last_chrs * (mincount-1);
5812                                 }
5813                                 last_chrs *= mincount;
5814                                 data->last_end += l * (mincount - 1);
5815                             }
5816                         } else {
5817                             /* start offset must point into the last copy */
5818                             data->last_start_min += minnext * (mincount - 1);
5819                             data->last_start_max =
5820                               is_inf
5821                                ? OPTIMIZE_INFTY
5822                                : data->last_start_max +
5823                                  (maxcount - 1) * (minnext + data->pos_delta);
5824                         }
5825                     }
5826                     /* It is counted once already... */
5827                     data->pos_min += minnext * (mincount - counted);
5828 #if 0
5829 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5830                               " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5831                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5832     (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5833     (UV)mincount);
5834 if (deltanext != OPTIMIZE_INFTY)
5835 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5836     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5837           - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5838 #endif
5839                     if (deltanext == OPTIMIZE_INFTY
5840                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5841                         data->pos_delta = OPTIMIZE_INFTY;
5842                     else
5843                         data->pos_delta += - counted * deltanext +
5844                         (minnext + deltanext) * maxcount - minnext * mincount;
5845                     if (mincount != maxcount) {
5846                          /* Cannot extend fixed substrings found inside
5847                             the group.  */
5848                         scan_commit(pRExC_state, data, minlenp, is_inf);
5849                         if (mincount && last_str) {
5850                             SV * const sv = data->last_found;
5851                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5852                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5853
5854                             if (mg)
5855                                 mg->mg_len = -1;
5856                             sv_setsv(sv, last_str);
5857                             data->last_end = data->pos_min;
5858                             data->last_start_min = data->pos_min - last_chrs;
5859                             data->last_start_max = is_inf
5860                                 ? OPTIMIZE_INFTY
5861                                 : data->pos_min + data->pos_delta - last_chrs;
5862                         }
5863                         data->cur_is_floating = 1; /* float */
5864                     }
5865                     SvREFCNT_dec(last_str);
5866                 }
5867                 if (data && (fl & SF_HAS_EVAL))
5868                     data->flags |= SF_HAS_EVAL;
5869               optimize_curly_tail:
5870                 rck_elide_nothing(oscan);
5871                 continue;
5872
5873             default:
5874                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5875                                                                     OP(scan));
5876             case REF:
5877             case CLUMP:
5878                 if (flags & SCF_DO_SUBSTR) {
5879                     /* Cannot expect anything... */
5880                     scan_commit(pRExC_state, data, minlenp, is_inf);
5881                     data->cur_is_floating = 1; /* float */
5882                 }
5883                 is_inf = is_inf_internal = 1;
5884                 if (flags & SCF_DO_STCLASS_OR) {
5885                     if (OP(scan) == CLUMP) {
5886                         /* Actually is any start char, but very few code points
5887                          * aren't start characters */
5888                         ssc_match_all_cp(data->start_class);
5889                     }
5890                     else {
5891                         ssc_anything(data->start_class);
5892                     }
5893                 }
5894                 flags &= ~SCF_DO_STCLASS;
5895                 break;
5896             }
5897         }
5898         else if (OP(scan) == LNBREAK) {
5899             if (flags & SCF_DO_STCLASS) {
5900                 if (flags & SCF_DO_STCLASS_AND) {
5901                     ssc_intersection(data->start_class,
5902                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5903                     ssc_clear_locale(data->start_class);
5904                     ANYOF_FLAGS(data->start_class)
5905                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5906                 }
5907                 else if (flags & SCF_DO_STCLASS_OR) {
5908                     ssc_union(data->start_class,
5909                               PL_XPosix_ptrs[_CC_VERTSPACE],
5910                               FALSE);
5911                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5912
5913                     /* See commit msg for
5914                      * 749e076fceedeb708a624933726e7989f2302f6a */
5915                     ANYOF_FLAGS(data->start_class)
5916                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5917                 }
5918                 flags &= ~SCF_DO_STCLASS;
5919             }
5920             min++;
5921             if (delta != OPTIMIZE_INFTY)
5922                 delta++;    /* Because of the 2 char string cr-lf */
5923             if (flags & SCF_DO_SUBSTR) {
5924                 /* Cannot expect anything... */
5925                 scan_commit(pRExC_state, data, minlenp, is_inf);
5926                 data->pos_min += 1;
5927                 if (data->pos_delta != OPTIMIZE_INFTY) {
5928                     data->pos_delta += 1;
5929                 }
5930                 data->cur_is_floating = 1; /* float */
5931             }
5932         }
5933         else if (REGNODE_SIMPLE(OP(scan))) {
5934
5935             if (flags & SCF_DO_SUBSTR) {
5936                 scan_commit(pRExC_state, data, minlenp, is_inf);
5937                 data->pos_min++;
5938             }
5939             min++;
5940             if (flags & SCF_DO_STCLASS) {
5941                 bool invert = 0;
5942                 SV* my_invlist = NULL;
5943                 U8 namedclass;
5944
5945                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5946                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5947
5948                 /* Some of the logic below assumes that switching
5949                    locale on will only add false positives. */
5950                 switch (OP(scan)) {
5951
5952                 default:
5953 #ifdef DEBUGGING
5954                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5955                                                                      OP(scan));
5956 #endif
5957                 case SANY:
5958                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5959                         ssc_match_all_cp(data->start_class);
5960                     break;
5961
5962                 case REG_ANY:
5963                     {
5964                         SV* REG_ANY_invlist = _new_invlist(2);
5965                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5966                                                             '\n');
5967                         if (flags & SCF_DO_STCLASS_OR) {
5968                             ssc_union(data->start_class,
5969                                       REG_ANY_invlist,
5970                                       TRUE /* TRUE => invert, hence all but \n
5971                                             */
5972                                       );
5973                         }
5974                         else if (flags & SCF_DO_STCLASS_AND) {
5975                             ssc_intersection(data->start_class,
5976                                              REG_ANY_invlist,
5977                                              TRUE  /* TRUE => invert */
5978                                              );
5979                             ssc_clear_locale(data->start_class);
5980                         }
5981                         SvREFCNT_dec_NN(REG_ANY_invlist);
5982                     }
5983                     break;
5984
5985                 case ANYOFD:
5986                 case ANYOFL:
5987                 case ANYOFPOSIXL:
5988                 case ANYOFH:
5989                 case ANYOFHb:
5990                 case ANYOFHr:
5991                 case ANYOFHs:
5992                 case ANYOF:
5993                     if (flags & SCF_DO_STCLASS_AND)
5994                         ssc_and(pRExC_state, data->start_class,
5995                                 (regnode_charclass *) scan);
5996                     else
5997                         ssc_or(pRExC_state, data->start_class,
5998                                                           (regnode_charclass *) scan);
5999                     break;
6000
6001                 case NANYOFM: /* NANYOFM already contains the inversion of the
6002                                  input ANYOF data, so, unlike things like
6003                                  NPOSIXA, don't change 'invert' to TRUE */
6004                     /* FALLTHROUGH */
6005                 case ANYOFM:
6006                   {
6007                     SV* cp_list = get_ANYOFM_contents(scan);
6008
6009                     if (flags & SCF_DO_STCLASS_OR) {
6010                         ssc_union(data->start_class, cp_list, invert);
6011                     }
6012                     else if (flags & SCF_DO_STCLASS_AND) {
6013                         ssc_intersection(data->start_class, cp_list, invert);
6014                     }
6015
6016                     SvREFCNT_dec_NN(cp_list);
6017                     break;
6018                   }
6019
6020                 case ANYOFR:
6021                 case ANYOFRb:
6022                   {
6023                     SV* cp_list = NULL;
6024
6025                     cp_list = _add_range_to_invlist(cp_list,
6026                                         ANYOFRbase(scan),
6027                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
6028
6029                     if (flags & SCF_DO_STCLASS_OR) {
6030                         ssc_union(data->start_class, cp_list, invert);
6031                     }
6032                     else if (flags & SCF_DO_STCLASS_AND) {
6033                         ssc_intersection(data->start_class, cp_list, invert);
6034                     }
6035
6036                     SvREFCNT_dec_NN(cp_list);
6037                     break;
6038                   }
6039
6040                 case NPOSIXL:
6041                     invert = 1;
6042                     /* FALLTHROUGH */
6043
6044                 case POSIXL:
6045                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6046                     if (flags & SCF_DO_STCLASS_AND) {
6047                         bool was_there = cBOOL(
6048                                           ANYOF_POSIXL_TEST(data->start_class,
6049                                                                  namedclass));
6050                         ANYOF_POSIXL_ZERO(data->start_class);
6051                         if (was_there) {    /* Do an AND */
6052                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6053                         }
6054                         /* No individual code points can now match */
6055                         data->start_class->invlist
6056                                                 = sv_2mortal(_new_invlist(0));
6057                     }
6058                     else {
6059                         int complement = namedclass + ((invert) ? -1 : 1);
6060
6061                         assert(flags & SCF_DO_STCLASS_OR);
6062
6063                         /* If the complement of this class was already there,
6064                          * the result is that they match all code points,
6065                          * (\d + \D == everything).  Remove the classes from
6066                          * future consideration.  Locale is not relevant in
6067                          * this case */
6068                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6069                             ssc_match_all_cp(data->start_class);
6070                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6071                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
6072                         }
6073                         else {  /* The usual case; just add this class to the
6074                                    existing set */
6075                             ANYOF_POSIXL_SET(data->start_class, namedclass);
6076                         }
6077                     }
6078                     break;
6079
6080                 case NPOSIXA:   /* For these, we always know the exact set of
6081                                    what's matched */
6082                     invert = 1;
6083                     /* FALLTHROUGH */
6084                 case POSIXA:
6085                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6086                     goto join_posix_and_ascii;
6087
6088                 case NPOSIXD:
6089                 case NPOSIXU:
6090                     invert = 1;
6091                     /* FALLTHROUGH */
6092                 case POSIXD:
6093                 case POSIXU:
6094                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6095
6096                     /* NPOSIXD matches all upper Latin1 code points unless the
6097                      * target string being matched is UTF-8, which is
6098                      * unknowable until match time.  Since we are going to
6099                      * invert, we want to get rid of all of them so that the
6100                      * inversion will match all */
6101                     if (OP(scan) == NPOSIXD) {
6102                         _invlist_subtract(my_invlist, PL_UpperLatin1,
6103                                           &my_invlist);
6104                     }
6105
6106                   join_posix_and_ascii:
6107
6108                     if (flags & SCF_DO_STCLASS_AND) {
6109                         ssc_intersection(data->start_class, my_invlist, invert);
6110                         ssc_clear_locale(data->start_class);
6111                     }
6112                     else {
6113                         assert(flags & SCF_DO_STCLASS_OR);
6114                         ssc_union(data->start_class, my_invlist, invert);
6115                     }
6116                     SvREFCNT_dec(my_invlist);
6117                 }
6118                 if (flags & SCF_DO_STCLASS_OR)
6119                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6120                 flags &= ~SCF_DO_STCLASS;
6121             }
6122         }
6123         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6124             data->flags |= (OP(scan) == MEOL
6125                             ? SF_BEFORE_MEOL
6126                             : SF_BEFORE_SEOL);
6127             scan_commit(pRExC_state, data, minlenp, is_inf);
6128
6129         }
6130         else if (  PL_regkind[OP(scan)] == BRANCHJ
6131                  /* Lookbehind, or need to calculate parens/evals/stclass: */
6132                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
6133                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6134         {
6135             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6136                 || OP(scan) == UNLESSM )
6137             {
6138                 /* Negative Lookahead/lookbehind
6139                    In this case we can't do fixed string optimisation.
6140                 */
6141
6142                 SSize_t deltanext, minnext, fake = 0;
6143                 regnode *nscan;
6144                 regnode_ssc intrnl;
6145                 int f = 0;
6146
6147                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6148                 if (data) {
6149                     data_fake.whilem_c = data->whilem_c;
6150                     data_fake.last_closep = data->last_closep;
6151                 }
6152                 else
6153                     data_fake.last_closep = &fake;
6154                 data_fake.pos_delta = delta;
6155                 if ( flags & SCF_DO_STCLASS && !scan->flags
6156                      && OP(scan) == IFMATCH ) { /* Lookahead */
6157                     ssc_init(pRExC_state, &intrnl);
6158                     data_fake.start_class = &intrnl;
6159                     f |= SCF_DO_STCLASS_AND;
6160                 }
6161                 if (flags & SCF_WHILEM_VISITED_POS)
6162                     f |= SCF_WHILEM_VISITED_POS;
6163                 next = regnext(scan);
6164                 nscan = NEXTOPER(NEXTOPER(scan));
6165
6166                 /* recurse study_chunk() for lookahead body */
6167                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6168                                       last, &data_fake, stopparen,
6169                                       recursed_depth, NULL, f, depth+1,
6170                                       mutate_ok);
6171                 if (scan->flags) {
6172                     if (   deltanext < 0
6173                         || deltanext > (I32) U8_MAX
6174                         || minnext > (I32)U8_MAX
6175                         || minnext + deltanext > (I32)U8_MAX)
6176                     {
6177                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6178                               (UV)U8_MAX);
6179                     }
6180
6181                     /* The 'next_off' field has been repurposed to count the
6182                      * additional starting positions to try beyond the initial
6183                      * one.  (This leaves it at 0 for non-variable length
6184                      * matches to avoid breakage for those not using this
6185                      * extension) */
6186                     if (deltanext) {
6187                         scan->next_off = deltanext;
6188                         ckWARNexperimental(RExC_parse,
6189                             WARN_EXPERIMENTAL__VLB,
6190                             "Variable length lookbehind is experimental");
6191                     }
6192                     scan->flags = (U8)minnext + deltanext;
6193                 }
6194                 if (data) {
6195                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6196                         pars++;
6197                     if (data_fake.flags & SF_HAS_EVAL)
6198                         data->flags |= SF_HAS_EVAL;
6199                     data->whilem_c = data_fake.whilem_c;
6200                 }
6201                 if (f & SCF_DO_STCLASS_AND) {
6202                     if (flags & SCF_DO_STCLASS_OR) {
6203                         /* OR before, AND after: ideally we would recurse with
6204                          * data_fake to get the AND applied by study of the
6205                          * remainder of the pattern, and then derecurse;
6206                          * *** HACK *** for now just treat as "no information".
6207                          * See [perl #56690].
6208                          */
6209                         ssc_init(pRExC_state, data->start_class);
6210                     }  else {
6211                         /* AND before and after: combine and continue.  These
6212                          * assertions are zero-length, so can match an EMPTY
6213                          * string */
6214                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6215                         ANYOF_FLAGS(data->start_class)
6216                                                    |= SSC_MATCHES_EMPTY_STRING;
6217                     }
6218                 }
6219             }
6220 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6221             else {
6222                 /* Positive Lookahead/lookbehind
6223                    In this case we can do fixed string optimisation,
6224                    but we must be careful about it. Note in the case of
6225                    lookbehind the positions will be offset by the minimum
6226                    length of the pattern, something we won't know about
6227                    until after the recurse.
6228                 */
6229                 SSize_t deltanext, fake = 0;
6230                 regnode *nscan;
6231                 regnode_ssc intrnl;
6232                 int f = 0;
6233                 /* We use SAVEFREEPV so that when the full compile
6234                     is finished perl will clean up the allocated
6235                     minlens when it's all done. This way we don't
6236                     have to worry about freeing them when we know
6237                     they wont be used, which would be a pain.
6238                  */
6239                 SSize_t *minnextp;
6240                 Newx( minnextp, 1, SSize_t );
6241                 SAVEFREEPV(minnextp);
6242
6243                 if (data) {
6244                     StructCopy(data, &data_fake, scan_data_t);
6245                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6246                         f |= SCF_DO_SUBSTR;
6247                         if (scan->flags)
6248                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6249                         data_fake.last_found=newSVsv(data->last_found);
6250                     }
6251                 }
6252                 else
6253                     data_fake.last_closep = &fake;
6254                 data_fake.flags = 0;
6255                 data_fake.substrs[0].flags = 0;
6256                 data_fake.substrs[1].flags = 0;
6257                 data_fake.pos_delta = delta;
6258                 if (is_inf)
6259                     data_fake.flags |= SF_IS_INF;
6260                 if ( flags & SCF_DO_STCLASS && !scan->flags
6261                      && OP(scan) == IFMATCH ) { /* Lookahead */
6262                     ssc_init(pRExC_state, &intrnl);
6263                     data_fake.start_class = &intrnl;
6264                     f |= SCF_DO_STCLASS_AND;
6265                 }
6266                 if (flags & SCF_WHILEM_VISITED_POS)
6267                     f |= SCF_WHILEM_VISITED_POS;
6268                 next = regnext(scan);
6269                 nscan = NEXTOPER(NEXTOPER(scan));
6270
6271                 /* positive lookahead study_chunk() recursion */
6272                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6273                                         &deltanext, last, &data_fake,
6274                                         stopparen, recursed_depth, NULL,
6275                                         f, depth+1, mutate_ok);
6276                 if (scan->flags) {
6277                     assert(0);  /* This code has never been tested since this
6278                                    is normally not compiled */
6279                     if (   deltanext < 0
6280                         || deltanext > (I32) U8_MAX
6281                         || *minnextp > (I32)U8_MAX
6282                         || *minnextp + deltanext > (I32)U8_MAX)
6283                     {
6284                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6285                               (UV)U8_MAX);
6286                     }
6287
6288                     if (deltanext) {
6289                         scan->next_off = deltanext;
6290                     }
6291                     scan->flags = (U8)*minnextp + deltanext;
6292                 }
6293
6294                 *minnextp += min;
6295
6296                 if (f & SCF_DO_STCLASS_AND) {
6297                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6298                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6299                 }
6300                 if (data) {
6301                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6302                         pars++;
6303                     if (data_fake.flags & SF_HAS_EVAL)
6304                         data->flags |= SF_HAS_EVAL;
6305                     data->whilem_c = data_fake.whilem_c;
6306                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6307                         int i;
6308                         if (RExC_rx->minlen<*minnextp)
6309                             RExC_rx->minlen=*minnextp;
6310                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6311                         SvREFCNT_dec_NN(data_fake.last_found);
6312
6313                         for (i = 0; i < 2; i++) {
6314                             if (data_fake.substrs[i].minlenp != minlenp) {
6315                                 data->substrs[i].min_offset =
6316                                             data_fake.substrs[i].min_offset;
6317                                 data->substrs[i].max_offset =
6318                                             data_fake.substrs[i].max_offset;
6319                                 data->substrs[i].minlenp =
6320                                             data_fake.substrs[i].minlenp;
6321                                 data->substrs[i].lookbehind += scan->flags;
6322                             }
6323                         }
6324                     }
6325                 }
6326             }
6327 #endif
6328         }
6329         else if (OP(scan) == OPEN) {
6330             if (stopparen != (I32)ARG(scan))
6331                 pars++;
6332         }
6333         else if (OP(scan) == CLOSE) {
6334             if (stopparen == (I32)ARG(scan)) {
6335                 break;
6336             }
6337             if ((I32)ARG(scan) == is_par) {
6338                 next = regnext(scan);
6339
6340                 if ( next && (OP(next) != WHILEM) && next < last)
6341                     is_par = 0;         /* Disable optimization */
6342             }
6343             if (data)
6344                 *(data->last_closep) = ARG(scan);
6345         }
6346         else if (OP(scan) == EVAL) {
6347                 if (data)
6348                     data->flags |= SF_HAS_EVAL;
6349         }
6350         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6351             if (flags & SCF_DO_SUBSTR) {
6352                 scan_commit(pRExC_state, data, minlenp, is_inf);
6353                 flags &= ~SCF_DO_SUBSTR;
6354             }
6355             if (data && OP(scan)==ACCEPT) {
6356                 data->flags |= SCF_SEEN_ACCEPT;
6357                 if (stopmin > min)
6358                     stopmin = min;
6359             }
6360         }
6361         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6362         {
6363                 if (flags & SCF_DO_SUBSTR) {
6364                     scan_commit(pRExC_state, data, minlenp, is_inf);
6365                     data->cur_is_floating = 1; /* float */
6366                 }
6367                 is_inf = is_inf_internal = 1;
6368                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6369                     ssc_anything(data->start_class);
6370                 flags &= ~SCF_DO_STCLASS;
6371         }
6372         else if (OP(scan) == GPOS) {
6373             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6374                 !(delta || is_inf || (data && data->pos_delta)))
6375             {
6376                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6377                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6378                 if (RExC_rx->gofs < (STRLEN)min)
6379                     RExC_rx->gofs = min;
6380             } else {
6381                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6382                 RExC_rx->gofs = 0;
6383             }
6384         }
6385 #ifdef TRIE_STUDY_OPT
6386 #ifdef FULL_TRIE_STUDY
6387         else if (PL_regkind[OP(scan)] == TRIE) {
6388             /* NOTE - There is similar code to this block above for handling
6389                BRANCH nodes on the initial study.  If you change stuff here
6390                check there too. */
6391             regnode *trie_node= scan;
6392             regnode *tail= regnext(scan);
6393             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6394             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6395             regnode_ssc accum;
6396
6397             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6398                 /* Cannot merge strings after this. */
6399                 scan_commit(pRExC_state, data, minlenp, is_inf);
6400             }
6401             if (flags & SCF_DO_STCLASS)
6402                 ssc_init_zero(pRExC_state, &accum);
6403
6404             if (!trie->jump) {
6405                 min1= trie->minlen;
6406                 max1= trie->maxlen;
6407             } else {
6408                 const regnode *nextbranch= NULL;
6409                 U32 word;
6410
6411                 for ( word=1 ; word <= trie->wordcount ; word++)
6412                 {
6413                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6414                     regnode_ssc this_class;
6415
6416                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6417                     if (data) {
6418                         data_fake.whilem_c = data->whilem_c;
6419                         data_fake.last_closep = data->last_closep;
6420                     }
6421                     else
6422                         data_fake.last_closep = &fake;
6423                     data_fake.pos_delta = delta;
6424                     if (flags & SCF_DO_STCLASS) {
6425                         ssc_init(pRExC_state, &this_class);
6426                         data_fake.start_class = &this_class;
6427                         f = SCF_DO_STCLASS_AND;
6428                     }
6429                     if (flags & SCF_WHILEM_VISITED_POS)
6430                         f |= SCF_WHILEM_VISITED_POS;
6431
6432                     if (trie->jump[word]) {
6433                         if (!nextbranch)
6434                             nextbranch = trie_node + trie->jump[0];
6435                         scan= trie_node + trie->jump[word];
6436                         /* We go from the jump point to the branch that follows
6437                            it. Note this means we need the vestigal unused
6438                            branches even though they arent otherwise used. */
6439                         /* optimise study_chunk() for TRIE */
6440                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6441                             &deltanext, (regnode *)nextbranch, &data_fake,
6442                             stopparen, recursed_depth, NULL, f, depth+1,
6443                             mutate_ok);
6444                     }
6445                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6446                         nextbranch= regnext((regnode*)nextbranch);
6447
6448                     if (min1 > (SSize_t)(minnext + trie->minlen))
6449                         min1 = minnext + trie->minlen;
6450                     if (deltanext == OPTIMIZE_INFTY) {
6451                         is_inf = is_inf_internal = 1;
6452                         max1 = OPTIMIZE_INFTY;
6453                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6454                         max1 = minnext + deltanext + trie->maxlen;
6455
6456                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6457                         pars++;
6458                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6459                         if ( stopmin > min + min1)
6460                             stopmin = min + min1;
6461                         flags &= ~SCF_DO_SUBSTR;
6462                         if (data)
6463                             data->flags |= SCF_SEEN_ACCEPT;
6464                     }
6465                     if (data) {
6466                         if (data_fake.flags & SF_HAS_EVAL)
6467                             data->flags |= SF_HAS_EVAL;
6468                         data->whilem_c = data_fake.whilem_c;
6469                     }
6470                     if (flags & SCF_DO_STCLASS)
6471                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6472                 }
6473             }
6474             if (flags & SCF_DO_SUBSTR) {
6475                 data->pos_min += min1;
6476                 data->pos_delta += max1 - min1;
6477                 if (max1 != min1 || is_inf)
6478                     data->cur_is_floating = 1; /* float */
6479             }
6480             min += min1;
6481             if (delta != OPTIMIZE_INFTY) {
6482                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6483                     delta += max1 - min1;
6484                 else
6485                     delta = OPTIMIZE_INFTY;
6486             }
6487             if (flags & SCF_DO_STCLASS_OR) {
6488                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6489                 if (min1) {
6490                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6491                     flags &= ~SCF_DO_STCLASS;
6492                 }
6493             }
6494             else if (flags & SCF_DO_STCLASS_AND) {
6495                 if (min1) {
6496                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6497                     flags &= ~SCF_DO_STCLASS;
6498                 }
6499                 else {
6500                     /* Switch to OR mode: cache the old value of
6501                      * data->start_class */
6502                     INIT_AND_WITHP;
6503                     StructCopy(data->start_class, and_withp, regnode_ssc);
6504                     flags &= ~SCF_DO_STCLASS_AND;
6505                     StructCopy(&accum, data->start_class, regnode_ssc);
6506                     flags |= SCF_DO_STCLASS_OR;
6507                 }
6508             }
6509             scan= tail;
6510             continue;
6511         }
6512 #else
6513         else if (PL_regkind[OP(scan)] == TRIE) {
6514             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6515             U8*bang=NULL;
6516
6517             min += trie->minlen;
6518             delta += (trie->maxlen - trie->minlen);
6519             flags &= ~SCF_DO_STCLASS; /* xxx */
6520             if (flags & SCF_DO_SUBSTR) {
6521                 /* Cannot expect anything... */
6522                 scan_commit(pRExC_state, data, minlenp, is_inf);
6523                 data->pos_min += trie->minlen;
6524                 data->pos_delta += (trie->maxlen - trie->minlen);
6525                 if (trie->maxlen != trie->minlen)
6526                     data->cur_is_floating = 1; /* float */
6527             }
6528             if (trie->jump) /* no more substrings -- for now /grr*/
6529                flags &= ~SCF_DO_SUBSTR;
6530         }
6531         else if (OP(scan) == REGEX_SET) {
6532             Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6533                              " before optimization", reg_name[REGEX_SET]);
6534         }
6535
6536 #endif /* old or new */
6537 #endif /* TRIE_STUDY_OPT */
6538
6539         /* Else: zero-length, ignore. */
6540         scan = regnext(scan);
6541     }
6542
6543   finish:
6544     if (frame) {
6545         /* we need to unwind recursion. */
6546         depth = depth - 1;
6547
6548         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6549         DEBUG_PEEP("fend", scan, depth, flags);
6550
6551         /* restore previous context */
6552         last = frame->last_regnode;
6553         scan = frame->next_regnode;
6554         stopparen = frame->stopparen;
6555         recursed_depth = frame->prev_recursed_depth;
6556
6557         RExC_frame_last = frame->prev_frame;
6558         frame = frame->this_prev_frame;
6559         goto fake_study_recurse;
6560     }
6561
6562     assert(!frame);
6563     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6564
6565     *scanp = scan;
6566     *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6567
6568     if (flags & SCF_DO_SUBSTR && is_inf)
6569         data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6570     if (is_par > (I32)U8_MAX)
6571         is_par = 0;
6572     if (is_par && pars==1 && data) {
6573         data->flags |= SF_IN_PAR;
6574         data->flags &= ~SF_HAS_PAR;
6575     }
6576     else if (pars && data) {
6577         data->flags |= SF_HAS_PAR;
6578         data->flags &= ~SF_IN_PAR;
6579     }
6580     if (flags & SCF_DO_STCLASS_OR)
6581         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6582     if (flags & SCF_TRIE_RESTUDY)
6583         data->flags |=  SCF_TRIE_RESTUDY;
6584
6585     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6586
6587     final_minlen = min < stopmin
6588             ? min : stopmin;
6589
6590     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6591         if (final_minlen > OPTIMIZE_INFTY - delta)
6592             RExC_maxlen = OPTIMIZE_INFTY;
6593         else if (RExC_maxlen < final_minlen + delta)
6594             RExC_maxlen = final_minlen + delta;
6595     }
6596     return final_minlen;
6597 }
6598
6599 STATIC U32
6600 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6601 {
6602     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6603
6604     PERL_ARGS_ASSERT_ADD_DATA;
6605
6606     Renewc(RExC_rxi->data,
6607            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6608            char, struct reg_data);
6609     if(count)
6610         Renew(RExC_rxi->data->what, count + n, U8);
6611     else
6612         Newx(RExC_rxi->data->what, n, U8);
6613     RExC_rxi->data->count = count + n;
6614     Copy(s, RExC_rxi->data->what + count, n, U8);
6615     return count;
6616 }
6617
6618 /*XXX: todo make this not included in a non debugging perl, but appears to be
6619  * used anyway there, in 'use re' */
6620 #ifndef PERL_IN_XSUB_RE
6621 void
6622 Perl_reginitcolors(pTHX)
6623 {
6624     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6625     if (s) {
6626         char *t = savepv(s);
6627         int i = 0;
6628         PL_colors[0] = t;
6629         while (++i < 6) {
6630             t = strchr(t, '\t');
6631             if (t) {
6632                 *t = '\0';
6633                 PL_colors[i] = ++t;
6634             }
6635             else
6636                 PL_colors[i] = t = (char *)"";
6637         }
6638     } else {
6639         int i = 0;
6640         while (i < 6)
6641             PL_colors[i++] = (char *)"";
6642     }
6643     PL_colorset = 1;
6644 }
6645 #endif
6646
6647
6648 #ifdef TRIE_STUDY_OPT
6649 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6650     STMT_START {                                            \
6651         if (                                                \
6652               (data.flags & SCF_TRIE_RESTUDY)               \
6653               && ! restudied++                              \
6654         ) {                                                 \
6655             dOsomething;                                    \
6656             goto reStudy;                                   \
6657         }                                                   \
6658     } STMT_END
6659 #else
6660 #define CHECK_RESTUDY_GOTO_butfirst
6661 #endif
6662
6663 /*
6664  * pregcomp - compile a regular expression into internal code
6665  *
6666  * Decides which engine's compiler to call based on the hint currently in
6667  * scope
6668  */
6669
6670 #ifndef PERL_IN_XSUB_RE
6671
6672 /* return the currently in-scope regex engine (or the default if none)  */
6673
6674 regexp_engine const *
6675 Perl_current_re_engine(pTHX)
6676 {
6677     if (IN_PERL_COMPILETIME) {
6678         HV * const table = GvHV(PL_hintgv);
6679         SV **ptr;
6680
6681         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6682             return &PL_core_reg_engine;
6683         ptr = hv_fetchs(table, "regcomp", FALSE);
6684         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6685             return &PL_core_reg_engine;
6686         return INT2PTR(regexp_engine*, SvIV(*ptr));
6687     }
6688     else {
6689         SV *ptr;
6690         if (!PL_curcop->cop_hints_hash)
6691             return &PL_core_reg_engine;
6692         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6693         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6694             return &PL_core_reg_engine;
6695         return INT2PTR(regexp_engine*, SvIV(ptr));
6696     }
6697 }
6698
6699
6700 REGEXP *
6701 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6702 {
6703     regexp_engine const *eng = current_re_engine();
6704     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6705
6706     PERL_ARGS_ASSERT_PREGCOMP;
6707
6708     /* Dispatch a request to compile a regexp to correct regexp engine. */
6709     DEBUG_COMPILE_r({
6710         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6711                         PTR2UV(eng));
6712     });
6713     return CALLREGCOMP_ENG(eng, pattern, flags);
6714 }
6715 #endif
6716
6717 /* public(ish) entry point for the perl core's own regex compiling code.
6718  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6719  * pattern rather than a list of OPs, and uses the internal engine rather
6720  * than the current one */
6721
6722 REGEXP *
6723 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6724 {
6725     SV *pat = pattern; /* defeat constness! */
6726
6727     PERL_ARGS_ASSERT_RE_COMPILE;
6728
6729     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6730 #ifdef PERL_IN_XSUB_RE
6731                                 &my_reg_engine,
6732 #else
6733                                 &PL_core_reg_engine,
6734 #endif
6735                                 NULL, NULL, rx_flags, 0);
6736 }
6737
6738 static void
6739 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6740 {
6741     int n;
6742
6743     if (--cbs->refcnt > 0)
6744         return;
6745     for (n = 0; n < cbs->count; n++) {
6746         REGEXP *rx = cbs->cb[n].src_regex;
6747         if (rx) {
6748             cbs->cb[n].src_regex = NULL;
6749             SvREFCNT_dec_NN(rx);
6750         }
6751     }
6752     Safefree(cbs->cb);
6753     Safefree(cbs);
6754 }
6755
6756
6757 static struct reg_code_blocks *
6758 S_alloc_code_blocks(pTHX_  int ncode)
6759 {
6760      struct reg_code_blocks *cbs;
6761     Newx(cbs, 1, struct reg_code_blocks);
6762     cbs->count = ncode;
6763     cbs->refcnt = 1;
6764     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6765     if (ncode)
6766         Newx(cbs->cb, ncode, struct reg_code_block);
6767     else
6768         cbs->cb = NULL;
6769     return cbs;
6770 }
6771
6772
6773 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6774  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6775  * point to the realloced string and length.
6776  *
6777  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6778  * stuff added */
6779
6780 static void
6781 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6782                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6783 {
6784     U8 *const src = (U8*)*pat_p;
6785     U8 *dst, *d;
6786     int n=0;
6787     STRLEN s = 0;
6788     bool do_end = 0;
6789     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6790
6791     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6792         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6793
6794     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6795     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6796     d = dst;
6797
6798     while (s < *plen_p) {
6799         append_utf8_from_native_byte(src[s], &d);
6800
6801         if (n < num_code_blocks) {
6802             assert(pRExC_state->code_blocks);
6803             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6804                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6805                 assert(*(d - 1) == '(');
6806                 do_end = 1;
6807             }
6808             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6809                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6810                 assert(*(d - 1) == ')');
6811                 do_end = 0;
6812                 n++;
6813             }
6814         }
6815         s++;
6816     }
6817     *d = '\0';
6818     *plen_p = d - dst;
6819     *pat_p = (char*) dst;
6820     SAVEFREEPV(*pat_p);
6821     RExC_orig_utf8 = RExC_utf8 = 1;
6822 }
6823
6824
6825
6826 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6827  * while recording any code block indices, and handling overloading,
6828  * nested qr// objects etc.  If pat is null, it will allocate a new
6829  * string, or just return the first arg, if there's only one.
6830  *
6831  * Returns the malloced/updated pat.
6832  * patternp and pat_count is the array of SVs to be concatted;
6833  * oplist is the optional list of ops that generated the SVs;
6834  * recompile_p is a pointer to a boolean that will be set if
6835  *   the regex will need to be recompiled.
6836  * delim, if non-null is an SV that will be inserted between each element
6837  */
6838
6839 static SV*
6840 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6841                 SV *pat, SV ** const patternp, int pat_count,
6842                 OP *oplist, bool *recompile_p, SV *delim)
6843 {
6844     SV **svp;
6845     int n = 0;
6846     bool use_delim = FALSE;
6847     bool alloced = FALSE;
6848
6849     /* if we know we have at least two args, create an empty string,
6850      * then concatenate args to that. For no args, return an empty string */
6851     if (!pat && pat_count != 1) {
6852         pat = newSVpvs("");
6853         SAVEFREESV(pat);
6854         alloced = TRUE;
6855     }
6856
6857     for (svp = patternp; svp < patternp + pat_count; svp++) {
6858         SV *sv;
6859         SV *rx  = NULL;
6860         STRLEN orig_patlen = 0;
6861         bool code = 0;
6862         SV *msv = use_delim ? delim : *svp;
6863         if (!msv) msv = &PL_sv_undef;
6864
6865         /* if we've got a delimiter, we go round the loop twice for each
6866          * svp slot (except the last), using the delimiter the second
6867          * time round */
6868         if (use_delim) {
6869             svp--;
6870             use_delim = FALSE;
6871         }
6872         else if (delim)
6873             use_delim = TRUE;
6874
6875         if (SvTYPE(msv) == SVt_PVAV) {
6876             /* we've encountered an interpolated array within
6877              * the pattern, e.g. /...@a..../. Expand the list of elements,
6878              * then recursively append elements.
6879              * The code in this block is based on S_pushav() */
6880
6881             AV *const av = (AV*)msv;
6882             const SSize_t maxarg = AvFILL(av) + 1;
6883             SV **array;
6884
6885             if (oplist) {
6886                 assert(oplist->op_type == OP_PADAV
6887                     || oplist->op_type == OP_RV2AV);
6888                 oplist = OpSIBLING(oplist);
6889             }
6890
6891             if (SvRMAGICAL(av)) {
6892                 SSize_t i;
6893
6894                 Newx(array, maxarg, SV*);
6895                 SAVEFREEPV(array);
6896                 for (i=0; i < maxarg; i++) {
6897                     SV ** const svp = av_fetch(av, i, FALSE);
6898                     array[i] = svp ? *svp : &PL_sv_undef;
6899                 }
6900             }
6901             else
6902                 array = AvARRAY(av);
6903
6904             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6905                                 array, maxarg, NULL, recompile_p,
6906                                 /* $" */
6907                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6908
6909             continue;
6910         }
6911
6912
6913         /* we make the assumption here that each op in the list of
6914          * op_siblings maps to one SV pushed onto the stack,
6915          * except for code blocks, with have both an OP_NULL and
6916          * an OP_CONST.
6917          * This allows us to match up the list of SVs against the
6918          * list of OPs to find the next code block.
6919          *
6920          * Note that       PUSHMARK PADSV PADSV ..
6921          * is optimised to
6922          *                 PADRANGE PADSV  PADSV  ..
6923          * so the alignment still works. */
6924
6925         if (oplist) {
6926             if (oplist->op_type == OP_NULL
6927                 && (oplist->op_flags & OPf_SPECIAL))
6928             {
6929                 assert(n < pRExC_state->code_blocks->count);
6930                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6931                 pRExC_state->code_blocks->cb[n].block = oplist;
6932                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6933                 n++;
6934                 code = 1;
6935                 oplist = OpSIBLING(oplist); /* skip CONST */
6936                 assert(oplist);
6937             }
6938             oplist = OpSIBLING(oplist);;
6939         }
6940
6941         /* apply magic and QR overloading to arg */
6942
6943         SvGETMAGIC(msv);
6944         if (SvROK(msv) && SvAMAGIC(msv)) {
6945             SV *sv = AMG_CALLunary(msv, regexp_amg);
6946             if (sv) {
6947                 if (SvROK(sv))
6948                     sv = SvRV(sv);
6949                 if (SvTYPE(sv) != SVt_REGEXP)
6950                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6951                 msv = sv;
6952             }
6953         }
6954
6955         /* try concatenation overload ... */
6956         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6957                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6958         {
6959             sv_setsv(pat, sv);
6960             /* overloading involved: all bets are off over literal
6961              * code. Pretend we haven't seen it */
6962             if (n)
6963                 pRExC_state->code_blocks->count -= n;
6964             n = 0;
6965         }
6966         else {
6967             /* ... or failing that, try "" overload */
6968             while (SvAMAGIC(msv)
6969                     && (sv = AMG_CALLunary(msv, string_amg))
6970                     && sv != msv
6971                     &&  !(   SvROK(msv)
6972                           && SvROK(sv)
6973                           && SvRV(msv) == SvRV(sv))
6974             ) {
6975                 msv = sv;
6976                 SvGETMAGIC(msv);
6977             }
6978             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6979                 msv = SvRV(msv);
6980
6981             if (pat) {
6982                 /* this is a partially unrolled
6983                  *     sv_catsv_nomg(pat, msv);
6984                  * that allows us to adjust code block indices if
6985                  * needed */
6986                 STRLEN dlen;
6987                 char *dst = SvPV_force_nomg(pat, dlen);
6988                 orig_patlen = dlen;
6989                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6990                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6991                     sv_setpvn(pat, dst, dlen);
6992                     SvUTF8_on(pat);
6993                 }
6994                 sv_catsv_nomg(pat, msv);
6995                 rx = msv;
6996             }
6997             else {
6998                 /* We have only one SV to process, but we need to verify
6999                  * it is properly null terminated or we will fail asserts
7000                  * later. In theory we probably shouldn't get such SV's,
7001                  * but if we do we should handle it gracefully. */
7002                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7003                     /* not a string, or a string with a trailing null */
7004                     pat = msv;
7005                 } else {
7006                     /* a string with no trailing null, we need to copy it
7007                      * so it has a trailing null */
7008                     pat = sv_2mortal(newSVsv(msv));
7009                 }
7010             }
7011
7012             if (code)
7013                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7014         }
7015
7016         /* extract any code blocks within any embedded qr//'s */
7017         if (rx && SvTYPE(rx) == SVt_REGEXP
7018             && RX_ENGINE((REGEXP*)rx)->op_comp)
7019         {
7020
7021             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7022             if (ri->code_blocks && ri->code_blocks->count) {
7023                 int i;
7024                 /* the presence of an embedded qr// with code means
7025                  * we should always recompile: the text of the
7026                  * qr// may not have changed, but it may be a
7027                  * different closure than last time */
7028                 *recompile_p = 1;
7029                 if (pRExC_state->code_blocks) {
7030                     int new_count = pRExC_state->code_blocks->count
7031                             + ri->code_blocks->count;
7032                     Renew(pRExC_state->code_blocks->cb,
7033                             new_count, struct reg_code_block);
7034                     pRExC_state->code_blocks->count = new_count;
7035                 }
7036                 else
7037                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7038                                                     ri->code_blocks->count);
7039
7040                 for (i=0; i < ri->code_blocks->count; i++) {
7041                     struct reg_code_block *src, *dst;
7042                     STRLEN offset =  orig_patlen
7043                         + ReANY((REGEXP *)rx)->pre_prefix;
7044                     assert(n < pRExC_state->code_blocks->count);
7045                     src = &ri->code_blocks->cb[i];
7046                     dst = &pRExC_state->code_blocks->cb[n];
7047                     dst->start      = src->start + offset;
7048                     dst->end        = src->end   + offset;
7049                     dst->block      = src->block;
7050                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
7051                                             src->src_regex
7052                                                 ? src->src_regex
7053                                                 : (REGEXP*)rx);
7054                     n++;
7055                 }
7056             }
7057         }
7058     }
7059     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7060     if (alloced)
7061         SvSETMAGIC(pat);
7062
7063     return pat;
7064 }
7065
7066
7067
7068 /* see if there are any run-time code blocks in the pattern.
7069  * False positives are allowed */
7070
7071 static bool
7072 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7073                     char *pat, STRLEN plen)
7074 {
7075     int n = 0;
7076     STRLEN s;
7077
7078     PERL_UNUSED_CONTEXT;
7079
7080     for (s = 0; s < plen; s++) {
7081         if (   pRExC_state->code_blocks
7082             && n < pRExC_state->code_blocks->count
7083             && s == pRExC_state->code_blocks->cb[n].start)
7084         {
7085             s = pRExC_state->code_blocks->cb[n].end;
7086             n++;
7087             continue;
7088         }
7089         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7090          * positives here */
7091         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7092             (pat[s+2] == '{'
7093                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7094         )
7095             return 1;
7096     }
7097     return 0;
7098 }
7099
7100 /* Handle run-time code blocks. We will already have compiled any direct
7101  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7102  * copy of it, but with any literal code blocks blanked out and
7103  * appropriate chars escaped; then feed it into
7104  *
7105  *    eval "qr'modified_pattern'"
7106  *
7107  * For example,
7108  *
7109  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7110  *
7111  * becomes
7112  *
7113  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7114  *
7115  * After eval_sv()-ing that, grab any new code blocks from the returned qr
7116  * and merge them with any code blocks of the original regexp.
7117  *
7118  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7119  * instead, just save the qr and return FALSE; this tells our caller that
7120  * the original pattern needs upgrading to utf8.
7121  */
7122
7123 static bool
7124 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7125     char *pat, STRLEN plen)
7126 {
7127     SV *qr;
7128
7129     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7130
7131     if (pRExC_state->runtime_code_qr) {
7132         /* this is the second time we've been called; this should
7133          * only happen if the main pattern got upgraded to utf8
7134          * during compilation; re-use the qr we compiled first time
7135          * round (which should be utf8 too)
7136          */
7137         qr = pRExC_state->runtime_code_qr;
7138         pRExC_state->runtime_code_qr = NULL;
7139         assert(RExC_utf8 && SvUTF8(qr));
7140     }
7141     else {
7142         int n = 0;
7143         STRLEN s;
7144         char *p, *newpat;
7145         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7146         SV *sv, *qr_ref;
7147         dSP;
7148
7149         /* determine how many extra chars we need for ' and \ escaping */
7150         for (s = 0; s < plen; s++) {
7151             if (pat[s] == '\'' || pat[s] == '\\')
7152                 newlen++;
7153         }
7154
7155         Newx(newpat, newlen, char);
7156         p = newpat;
7157         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7158
7159         for (s = 0; s < plen; s++) {
7160             if (   pRExC_state->code_blocks
7161                 && n < pRExC_state->code_blocks->count
7162                 && s == pRExC_state->code_blocks->cb[n].start)
7163             {
7164                 /* blank out literal code block so that they aren't
7165                  * recompiled: eg change from/to:
7166                  *     /(?{xyz})/
7167                  *     /(?=====)/
7168                  * and
7169                  *     /(??{xyz})/
7170                  *     /(?======)/
7171                  * and
7172                  *     /(?(?{xyz}))/
7173                  *     /(?(?=====))/
7174                 */
7175                 assert(pat[s]   == '(');
7176                 assert(pat[s+1] == '?');
7177                 *p++ = '(';
7178                 *p++ = '?';
7179                 s += 2;
7180                 while (s < pRExC_state->code_blocks->cb[n].end) {
7181                     *p++ = '=';
7182                     s++;
7183                 }
7184                 *p++ = ')';
7185                 n++;
7186                 continue;
7187             }
7188             if (pat[s] == '\'' || pat[s] == '\\')
7189                 *p++ = '\\';
7190             *p++ = pat[s];
7191         }
7192         *p++ = '\'';
7193         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7194             *p++ = 'x';
7195             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7196                 *p++ = 'x';
7197             }
7198         }
7199         *p++ = '\0';
7200         DEBUG_COMPILE_r({
7201             Perl_re_printf( aTHX_
7202                 "%sre-parsing pattern for runtime code:%s %s\n",
7203                 PL_colors[4], PL_colors[5], newpat);
7204         });
7205
7206         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7207         Safefree(newpat);
7208
7209         ENTER;
7210         SAVETMPS;
7211         save_re_context();
7212         PUSHSTACKi(PERLSI_REQUIRE);
7213         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7214          * parsing qr''; normally only q'' does this. It also alters
7215          * hints handling */
7216         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7217         SvREFCNT_dec_NN(sv);
7218         SPAGAIN;
7219         qr_ref = POPs;
7220         PUTBACK;
7221         {
7222             SV * const errsv = ERRSV;
7223             if (SvTRUE_NN(errsv))
7224                 /* use croak_sv ? */
7225                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7226         }
7227         assert(SvROK(qr_ref));
7228         qr = SvRV(qr_ref);
7229         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7230         /* the leaving below frees the tmp qr_ref.
7231          * Give qr a life of its own */
7232         SvREFCNT_inc(qr);
7233         POPSTACK;
7234         FREETMPS;
7235         LEAVE;
7236
7237     }
7238
7239     if (!RExC_utf8 && SvUTF8(qr)) {
7240         /* first time through; the pattern got upgraded; save the
7241          * qr for the next time through */
7242         assert(!pRExC_state->runtime_code_qr);
7243         pRExC_state->runtime_code_qr = qr;
7244         return 0;
7245     }
7246
7247
7248     /* extract any code blocks within the returned qr//  */
7249
7250
7251     /* merge the main (r1) and run-time (r2) code blocks into one */
7252     {
7253         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7254         struct reg_code_block *new_block, *dst;
7255         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7256         int i1 = 0, i2 = 0;
7257         int r1c, r2c;
7258
7259         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7260         {
7261             SvREFCNT_dec_NN(qr);
7262             return 1;
7263         }
7264
7265         if (!r1->code_blocks)
7266             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7267
7268         r1c = r1->code_blocks->count;
7269         r2c = r2->code_blocks->count;
7270
7271         Newx(new_block, r1c + r2c, struct reg_code_block);
7272
7273         dst = new_block;
7274
7275         while (i1 < r1c || i2 < r2c) {
7276             struct reg_code_block *src;
7277             bool is_qr = 0;
7278
7279             if (i1 == r1c) {
7280                 src = &r2->code_blocks->cb[i2++];
7281                 is_qr = 1;
7282             }
7283             else if (i2 == r2c)
7284                 src = &r1->code_blocks->cb[i1++];
7285             else if (  r1->code_blocks->cb[i1].start
7286                      < r2->code_blocks->cb[i2].start)
7287             {
7288                 src = &r1->code_blocks->cb[i1++];
7289                 assert(src->end < r2->code_blocks->cb[i2].start);
7290             }
7291             else {
7292                 assert(  r1->code_blocks->cb[i1].start
7293                        > r2->code_blocks->cb[i2].start);
7294                 src = &r2->code_blocks->cb[i2++];
7295                 is_qr = 1;
7296                 assert(src->end < r1->code_blocks->cb[i1].start);
7297             }
7298
7299             assert(pat[src->start] == '(');
7300             assert(pat[src->end]   == ')');
7301             dst->start      = src->start;
7302             dst->end        = src->end;
7303             dst->block      = src->block;
7304             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7305                                     : src->src_regex;
7306             dst++;
7307         }
7308         r1->code_blocks->count += r2c;
7309         Safefree(r1->code_blocks->cb);
7310         r1->code_blocks->cb = new_block;
7311     }
7312
7313     SvREFCNT_dec_NN(qr);
7314     return 1;
7315 }
7316
7317
7318 STATIC bool
7319 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7320                       struct reg_substr_datum  *rsd,
7321                       struct scan_data_substrs *sub,
7322                       STRLEN longest_length)
7323 {
7324     /* This is the common code for setting up the floating and fixed length
7325      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7326      * as to whether succeeded or not */
7327
7328     I32 t;
7329     SSize_t ml;
7330     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7331     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7332
7333     if (! (longest_length
7334            || (eol /* Can't have SEOL and MULTI */
7335                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7336           )
7337             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7338         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7339     {
7340         return FALSE;
7341     }
7342
7343     /* copy the information about the longest from the reg_scan_data
7344         over to the program. */
7345     if (SvUTF8(sub->str)) {
7346         rsd->substr      = NULL;
7347         rsd->utf8_substr = sub->str;
7348     } else {
7349         rsd->substr      = sub->str;
7350         rsd->utf8_substr = NULL;
7351     }
7352     /* end_shift is how many chars that must be matched that
7353         follow this item. We calculate it ahead of time as once the
7354         lookbehind offset is added in we lose the ability to correctly
7355         calculate it.*/
7356     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7357     rsd->end_shift = ml - sub->min_offset
7358         - longest_length
7359             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7360              * intead? - DAPM
7361             + (SvTAIL(sub->str) != 0)
7362             */
7363         + sub->lookbehind;
7364
7365     t = (eol/* Can't have SEOL and MULTI */
7366          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7367     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7368
7369     return TRUE;
7370 }
7371
7372 STATIC void
7373 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7374 {
7375     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7376      * properly wrapped with the right modifiers */
7377
7378     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7379     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7380                                                 != REGEX_DEPENDS_CHARSET);
7381
7382     /* The caret is output if there are any defaults: if not all the STD
7383         * flags are set, or if no character set specifier is needed */
7384     bool has_default =
7385                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7386                 || ! has_charset);
7387     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7388                                                 == REG_RUN_ON_COMMENT_SEEN);
7389     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7390                         >> RXf_PMf_STD_PMMOD_SHIFT);
7391     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7392     char *p;
7393     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7394
7395     /* We output all the necessary flags; we never output a minus, as all
7396         * those are defaults, so are
7397         * covered by the caret */
7398     const STRLEN wraplen = pat_len + has_p + has_runon
7399         + has_default       /* If needs a caret */
7400         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7401
7402             /* If needs a character set specifier */
7403         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7404         + (sizeof("(?:)") - 1);
7405
7406     PERL_ARGS_ASSERT_SET_REGEX_PV;
7407
7408     /* make sure PL_bitcount bounds not exceeded */
7409     STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7410
7411     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7412     SvPOK_on(Rx);
7413     if (RExC_utf8)
7414         SvFLAGS(Rx) |= SVf_UTF8;
7415     *p++='('; *p++='?';
7416
7417     /* If a default, cover it using the caret */
7418     if (has_default) {
7419         *p++= DEFAULT_PAT_MOD;
7420     }
7421     if (has_charset) {
7422         STRLEN len;
7423         const char* name;
7424
7425         name = get_regex_charset_name(RExC_rx->extflags, &len);
7426         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7427             assert(RExC_utf8);
7428             name = UNICODE_PAT_MODS;
7429             len = sizeof(UNICODE_PAT_MODS) - 1;
7430         }
7431         Copy(name, p, len, char);
7432         p += len;
7433     }
7434     if (has_p)
7435         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7436     {
7437         char ch;
7438         while((ch = *fptr++)) {
7439             if(reganch & 1)
7440                 *p++ = ch;
7441             reganch >>= 1;
7442         }
7443     }
7444
7445     *p++ = ':';
7446     Copy(RExC_precomp, p, pat_len, char);
7447     assert ((RX_WRAPPED(Rx) - p) < 16);
7448     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7449     p += pat_len;
7450
7451     /* Adding a trailing \n causes this to compile properly:
7452             my $R = qr / A B C # D E/x; /($R)/
7453         Otherwise the parens are considered part of the comment */
7454     if (has_runon)
7455         *p++ = '\n';
7456     *p++ = ')';
7457     *p = 0;
7458     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7459 }
7460
7461 /*
7462  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7463  * regular expression into internal code.
7464  * The pattern may be passed either as:
7465  *    a list of SVs (patternp plus pat_count)
7466  *    a list of OPs (expr)
7467  * If both are passed, the SV list is used, but the OP list indicates
7468  * which SVs are actually pre-compiled code blocks
7469  *
7470  * The SVs in the list have magic and qr overloading applied to them (and
7471  * the list may be modified in-place with replacement SVs in the latter
7472  * case).
7473  *
7474  * If the pattern hasn't changed from old_re, then old_re will be
7475  * returned.
7476  *
7477  * eng is the current engine. If that engine has an op_comp method, then
7478  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7479  * do the initial concatenation of arguments and pass on to the external
7480  * engine.
7481  *
7482  * If is_bare_re is not null, set it to a boolean indicating whether the
7483  * arg list reduced (after overloading) to a single bare regex which has
7484  * been returned (i.e. /$qr/).
7485  *
7486  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7487  *
7488  * pm_flags contains the PMf_* flags, typically based on those from the
7489  * pm_flags field of the related PMOP. Currently we're only interested in
7490  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7491  *
7492  * For many years this code had an initial sizing pass that calculated
7493  * (sometimes incorrectly, leading to security holes) the size needed for the
7494  * compiled pattern.  That was changed by commit
7495  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7496  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7497  * references to this sizing pass.
7498  *
7499  * Now, an initial crude guess as to the size needed is made, based on the
7500  * length of the pattern.  Patches welcome to improve that guess.  That amount
7501  * of space is malloc'd and then immediately freed, and then clawed back node
7502  * by node.  This design is to minimze, to the extent possible, memory churn
7503  * when doing the reallocs.
7504  *
7505  * A separate parentheses counting pass may be needed in some cases.
7506  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7507  * of these cases.
7508  *
7509  * The existence of a sizing pass necessitated design decisions that are no
7510  * longer needed.  There are potential areas of simplification.
7511  *
7512  * Beware that the optimization-preparation code in here knows about some
7513  * of the structure of the compiled regexp.  [I'll say.]
7514  */
7515
7516 REGEXP *
7517 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7518                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7519                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7520 {
7521     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7522     STRLEN plen;
7523     char *exp;
7524     regnode *scan;
7525     I32 flags;
7526     SSize_t minlen = 0;
7527     U32 rx_flags;
7528     SV *pat;
7529     SV** new_patternp = patternp;
7530
7531     /* these are all flags - maybe they should be turned
7532      * into a single int with different bit masks */
7533     I32 sawlookahead = 0;
7534     I32 sawplus = 0;
7535     I32 sawopen = 0;
7536     I32 sawminmod = 0;
7537
7538     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7539     bool recompile = 0;
7540     bool runtime_code = 0;
7541     scan_data_t data;
7542     RExC_state_t RExC_state;
7543     RExC_state_t * const pRExC_state = &RExC_state;
7544 #ifdef TRIE_STUDY_OPT
7545     int restudied = 0;
7546     RExC_state_t copyRExC_state;
7547 #endif
7548     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7549
7550     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7551
7552     DEBUG_r(if (!PL_colorset) reginitcolors());
7553
7554
7555     pRExC_state->warn_text = NULL;
7556     pRExC_state->unlexed_names = NULL;
7557     pRExC_state->code_blocks = NULL;
7558
7559     if (is_bare_re)
7560         *is_bare_re = FALSE;
7561
7562     if (expr && (expr->op_type == OP_LIST ||
7563                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7564         /* allocate code_blocks if needed */
7565         OP *o;
7566         int ncode = 0;
7567
7568         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7569             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7570                 ncode++; /* count of DO blocks */
7571
7572         if (ncode)
7573             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7574     }
7575
7576     if (!pat_count) {
7577         /* compile-time pattern with just OP_CONSTs and DO blocks */
7578
7579         int n;
7580         OP *o;
7581
7582         /* find how many CONSTs there are */
7583         assert(expr);
7584         n = 0;
7585         if (expr->op_type == OP_CONST)
7586             n = 1;
7587         else
7588             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7589                 if (o->op_type == OP_CONST)
7590                     n++;
7591             }
7592
7593         /* fake up an SV array */
7594
7595         assert(!new_patternp);
7596         Newx(new_patternp, n, SV*);
7597         SAVEFREEPV(new_patternp);
7598         pat_count = n;
7599
7600         n = 0;
7601         if (expr->op_type == OP_CONST)
7602             new_patternp[n] = cSVOPx_sv(expr);
7603         else
7604             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7605                 if (o->op_type == OP_CONST)
7606                     new_patternp[n++] = cSVOPo_sv;
7607             }
7608
7609     }
7610
7611     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7612         "Assembling pattern from %d elements%s\n", pat_count,
7613             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7614
7615     /* set expr to the first arg op */
7616
7617     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7618          && expr->op_type != OP_CONST)
7619     {
7620             expr = cLISTOPx(expr)->op_first;
7621             assert(   expr->op_type == OP_PUSHMARK
7622                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7623                    || expr->op_type == OP_PADRANGE);
7624             expr = OpSIBLING(expr);
7625     }
7626
7627     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7628                         expr, &recompile, NULL);
7629
7630     /* handle bare (possibly after overloading) regex: foo =~ $re */
7631     {
7632         SV *re = pat;
7633         if (SvROK(re))
7634             re = SvRV(re);
7635         if (SvTYPE(re) == SVt_REGEXP) {
7636             if (is_bare_re)
7637                 *is_bare_re = TRUE;
7638             SvREFCNT_inc(re);
7639             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7640                 "Precompiled pattern%s\n",
7641                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7642
7643             return (REGEXP*)re;
7644         }
7645     }
7646
7647     exp = SvPV_nomg(pat, plen);
7648
7649     if (!eng->op_comp) {
7650         if ((SvUTF8(pat) && IN_BYTES)
7651                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7652         {
7653             /* make a temporary copy; either to convert to bytes,
7654              * or to avoid repeating get-magic / overloaded stringify */
7655             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7656                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7657         }
7658         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7659     }
7660
7661     /* ignore the utf8ness if the pattern is 0 length */
7662     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7663     RExC_uni_semantics = 0;
7664     RExC_contains_locale = 0;
7665     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7666     RExC_in_script_run = 0;
7667     RExC_study_started = 0;
7668     pRExC_state->runtime_code_qr = NULL;
7669     RExC_frame_head= NULL;
7670     RExC_frame_last= NULL;
7671     RExC_frame_count= 0;
7672     RExC_latest_warn_offset = 0;
7673     RExC_use_BRANCHJ = 0;
7674     RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7675     RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7676     RExC_total_parens = 0;
7677     RExC_open_parens = NULL;
7678     RExC_close_parens = NULL;
7679     RExC_paren_names = NULL;
7680     RExC_size = 0;
7681     RExC_seen_d_op = FALSE;
7682 #ifdef DEBUGGING
7683     RExC_paren_name_list = NULL;
7684 #endif
7685
7686     DEBUG_r({
7687         RExC_mysv1= sv_newmortal();
7688         RExC_mysv2= sv_newmortal();
7689     });
7690
7691     DEBUG_COMPILE_r({
7692             SV *dsv= sv_newmortal();
7693             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7694             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7695                           PL_colors[4], PL_colors[5], s);
7696         });
7697
7698     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7699      * to utf8 */
7700
7701     if ((pm_flags & PMf_USE_RE_EVAL)
7702                 /* this second condition covers the non-regex literal case,
7703                  * i.e.  $foo =~ '(?{})'. */
7704                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7705     )
7706         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7707
7708   redo_parse:
7709     /* return old regex if pattern hasn't changed */
7710     /* XXX: note in the below we have to check the flags as well as the
7711      * pattern.
7712      *
7713      * Things get a touch tricky as we have to compare the utf8 flag
7714      * independently from the compile flags.  */
7715
7716     if (   old_re
7717         && !recompile
7718         && !!RX_UTF8(old_re) == !!RExC_utf8
7719         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7720         && RX_PRECOMP(old_re)
7721         && RX_PRELEN(old_re) == plen
7722         && memEQ(RX_PRECOMP(old_re), exp, plen)
7723         && !runtime_code /* with runtime code, always recompile */ )
7724     {
7725         DEBUG_COMPILE_r({
7726             SV *dsv= sv_newmortal();
7727             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7728             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7729                           PL_colors[4], PL_colors[5], s);
7730         });
7731         return old_re;
7732     }
7733
7734     /* Allocate the pattern's SV */
7735     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7736     RExC_rx = ReANY(Rx);
7737     if ( RExC_rx == NULL )
7738         FAIL("Regexp out of space");
7739
7740     rx_flags = orig_rx_flags;
7741
7742     if (   toUSE_UNI_CHARSET_NOT_DEPENDS
7743         && initial_charset == REGEX_DEPENDS_CHARSET)
7744     {
7745
7746         /* Set to use unicode semantics if the pattern is in utf8 and has the
7747          * 'depends' charset specified, as it means unicode when utf8  */
7748         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7749         RExC_uni_semantics = 1;
7750     }
7751
7752     RExC_pm_flags = pm_flags;
7753
7754     if (runtime_code) {
7755         assert(TAINTING_get || !TAINT_get);
7756         if (TAINT_get)
7757             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7758
7759         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7760             /* whoops, we have a non-utf8 pattern, whilst run-time code
7761              * got compiled as utf8. Try again with a utf8 pattern */
7762             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7763                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7764             goto redo_parse;
7765         }
7766     }
7767     assert(!pRExC_state->runtime_code_qr);
7768
7769     RExC_sawback = 0;
7770
7771     RExC_seen = 0;
7772     RExC_maxlen = 0;
7773     RExC_in_lookaround = 0;
7774     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7775     RExC_recode_x_to_native = 0;
7776     RExC_in_multi_char_class = 0;
7777
7778     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7779     RExC_precomp_end = RExC_end = exp + plen;
7780     RExC_nestroot = 0;
7781     RExC_whilem_seen = 0;
7782     RExC_end_op = NULL;
7783     RExC_recurse = NULL;
7784     RExC_study_chunk_recursed = NULL;
7785     RExC_study_chunk_recursed_bytes= 0;
7786     RExC_recurse_count = 0;
7787     RExC_sets_depth = 0;
7788     pRExC_state->code_index = 0;
7789
7790     /* Initialize the string in the compiled pattern.  This is so that there is
7791      * something to output if necessary */
7792     set_regex_pv(pRExC_state, Rx);
7793
7794     DEBUG_PARSE_r({
7795         Perl_re_printf( aTHX_
7796             "Starting parse and generation\n");
7797         RExC_lastnum=0;
7798         RExC_lastparse=NULL;
7799     });
7800
7801     /* Allocate space and zero-initialize. Note, the two step process
7802        of zeroing when in debug mode, thus anything assigned has to
7803        happen after that */
7804     if (!  RExC_size) {
7805
7806         /* On the first pass of the parse, we guess how big this will be.  Then
7807          * we grow in one operation to that amount and then give it back.  As
7808          * we go along, we re-allocate what we need.
7809          *
7810          * XXX Currently the guess is essentially that the pattern will be an
7811          * EXACT node with one byte input, one byte output.  This is crude, and
7812          * better heuristics are welcome.
7813          *
7814          * On any subsequent passes, we guess what we actually computed in the
7815          * latest earlier pass.  Such a pass probably didn't complete so is
7816          * missing stuff.  We could improve those guesses by knowing where the
7817          * parse stopped, and use the length so far plus apply the above
7818          * assumption to what's left. */
7819         RExC_size = STR_SZ(RExC_end - RExC_start);
7820     }
7821
7822     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7823     if ( RExC_rxi == NULL )
7824         FAIL("Regexp out of space");
7825
7826     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7827     RXi_SET( RExC_rx, RExC_rxi );
7828
7829     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7830      * node parsed will give back any excess memory we have allocated so far).
7831      * */
7832     RExC_size = 0;
7833
7834     /* non-zero initialization begins here */
7835     RExC_rx->engine= eng;
7836     RExC_rx->extflags = rx_flags;
7837     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7838
7839     if (pm_flags & PMf_IS_QR) {
7840         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7841         if (RExC_rxi->code_blocks) {
7842             RExC_rxi->code_blocks->refcnt++;
7843         }
7844     }
7845
7846     RExC_rx->intflags = 0;
7847
7848     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7849     RExC_parse = exp;
7850
7851     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7852      * code makes sure the final byte is an uncounted NUL.  But should this
7853      * ever not be the case, lots of things could read beyond the end of the
7854      * buffer: loops like
7855      *      while(isFOO(*RExC_parse)) RExC_parse++;
7856      *      strchr(RExC_parse, "foo");
7857      * etc.  So it is worth noting. */
7858     assert(*RExC_end == '\0');
7859
7860     RExC_naughty = 0;
7861     RExC_npar = 1;
7862     RExC_parens_buf_size = 0;
7863     RExC_emit_start = RExC_rxi->program;
7864     pRExC_state->code_index = 0;
7865
7866     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7867     RExC_emit = 1;
7868
7869     /* Do the parse */
7870     if (reg(pRExC_state, 0, &flags, 1)) {
7871
7872         /* Success!, But we may need to redo the parse knowing how many parens
7873          * there actually are */
7874         if (IN_PARENS_PASS) {
7875             flags |= RESTART_PARSE;
7876         }
7877
7878         /* We have that number in RExC_npar */
7879         RExC_total_parens = RExC_npar;
7880     }
7881     else if (! MUST_RESTART(flags)) {
7882         ReREFCNT_dec(Rx);
7883         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7884     }
7885
7886     /* Here, we either have success, or we have to redo the parse for some reason */
7887     if (MUST_RESTART(flags)) {
7888
7889         /* It's possible to write a regexp in ascii that represents Unicode
7890         codepoints outside of the byte range, such as via \x{100}. If we
7891         detect such a sequence we have to convert the entire pattern to utf8
7892         and then recompile, as our sizing calculation will have been based
7893         on 1 byte == 1 character, but we will need to use utf8 to encode
7894         at least some part of the pattern, and therefore must convert the whole
7895         thing.
7896         -- dmq */
7897         if (flags & NEED_UTF8) {
7898
7899             /* We have stored the offset of the final warning output so far.
7900              * That must be adjusted.  Any variant characters between the start
7901              * of the pattern and this warning count for 2 bytes in the final,
7902              * so just add them again */
7903             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7904                 RExC_latest_warn_offset +=
7905                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7906                                                 + RExC_latest_warn_offset);
7907             }
7908             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7909             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7910             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7911         }
7912         else {
7913             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7914         }
7915
7916         if (ALL_PARENS_COUNTED) {
7917             /* Make enough room for all the known parens, and zero it */
7918             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7919             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7920             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7921
7922             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7923             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7924         }
7925         else { /* Parse did not complete.  Reinitialize the parentheses
7926                   structures */
7927             RExC_total_parens = 0;
7928             if (RExC_open_parens) {
7929                 Safefree(RExC_open_parens);
7930                 RExC_open_parens = NULL;
7931             }
7932             if (RExC_close_parens) {
7933                 Safefree(RExC_close_parens);
7934                 RExC_close_parens = NULL;
7935             }
7936         }
7937
7938         /* Clean up what we did in this parse */
7939         SvREFCNT_dec_NN(RExC_rx_sv);
7940
7941         goto redo_parse;
7942     }
7943
7944     /* Here, we have successfully parsed and generated the pattern's program
7945      * for the regex engine.  We are ready to finish things up and look for
7946      * optimizations. */
7947
7948     /* Update the string to compile, with correct modifiers, etc */
7949     set_regex_pv(pRExC_state, Rx);
7950
7951     RExC_rx->nparens = RExC_total_parens - 1;
7952
7953     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7954     if (RExC_whilem_seen > 15)
7955         RExC_whilem_seen = 15;
7956
7957     DEBUG_PARSE_r({
7958         Perl_re_printf( aTHX_
7959             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7960         RExC_lastnum=0;
7961         RExC_lastparse=NULL;
7962     });
7963
7964 #ifdef RE_TRACK_PATTERN_OFFSETS
7965     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7966                           "%s %" UVuf " bytes for offset annotations.\n",
7967                           RExC_offsets ? "Got" : "Couldn't get",
7968                           (UV)((RExC_offsets[0] * 2 + 1))));
7969     DEBUG_OFFSETS_r(if (RExC_offsets) {
7970         const STRLEN len = RExC_offsets[0];
7971         STRLEN i;
7972         DECLARE_AND_GET_RE_DEBUG_FLAGS;
7973         Perl_re_printf( aTHX_
7974                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7975         for (i = 1; i <= len; i++) {
7976             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7977                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7978                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7979         }
7980         Perl_re_printf( aTHX_  "\n");
7981     });
7982
7983 #else
7984     SetProgLen(RExC_rxi,RExC_size);
7985 #endif
7986
7987     DEBUG_DUMP_PRE_OPTIMIZE_r({
7988         SV * const sv = sv_newmortal();
7989         RXi_GET_DECL(RExC_rx, ri);
7990         DEBUG_RExC_seen();
7991         Perl_re_printf( aTHX_ "Program before optimization:\n");
7992
7993         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7994                         sv, 0, 0);
7995     });
7996
7997     DEBUG_OPTIMISE_r(
7998         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7999     );
8000
8001     /* XXXX To minimize changes to RE engine we always allocate
8002        3-units-long substrs field. */
8003     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8004     if (RExC_recurse_count) {
8005         Newx(RExC_recurse, RExC_recurse_count, regnode *);
8006         SAVEFREEPV(RExC_recurse);
8007     }
8008
8009     if (RExC_seen & REG_RECURSE_SEEN) {
8010         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8011          * So its 1 if there are no parens. */
8012         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8013                                          ((RExC_total_parens & 0x07) != 0);
8014         Newx(RExC_study_chunk_recursed,
8015              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8016         SAVEFREEPV(RExC_study_chunk_recursed);
8017     }
8018
8019   reStudy:
8020     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8021     DEBUG_r(
8022         RExC_study_chunk_recursed_count= 0;
8023     );
8024     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8025     if (RExC_study_chunk_recursed) {
8026         Zero(RExC_study_chunk_recursed,
8027              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8028     }
8029
8030
8031 #ifdef TRIE_STUDY_OPT
8032     if (!restudied) {
8033         StructCopy(&zero_scan_data, &data, scan_data_t);
8034         copyRExC_state = RExC_state;
8035     } else {
8036         U32 seen=RExC_seen;
8037         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8038
8039         RExC_state = copyRExC_state;
8040         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8041             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8042         else
8043             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8044         StructCopy(&zero_scan_data, &data, scan_data_t);
8045     }
8046 #else
8047     StructCopy(&zero_scan_data, &data, scan_data_t);
8048 #endif
8049
8050     /* Dig out information for optimizations. */
8051     RExC_rx->extflags = RExC_flags; /* was pm_op */
8052     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8053
8054     if (UTF)
8055         SvUTF8_on(Rx);  /* Unicode in it? */
8056     RExC_rxi->regstclass = NULL;
8057     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
8058         RExC_rx->intflags |= PREGf_NAUGHTY;
8059     scan = RExC_rxi->program + 1;               /* First BRANCH. */
8060
8061     /* testing for BRANCH here tells us whether there is "must appear"
8062        data in the pattern. If there is then we can use it for optimisations */
8063     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
8064                                                   */
8065         SSize_t fake;
8066         STRLEN longest_length[2];
8067         regnode_ssc ch_class; /* pointed to by data */
8068         int stclass_flag;
8069         SSize_t last_close = 0; /* pointed to by data */
8070         regnode *first= scan;
8071         regnode *first_next= regnext(first);
8072         int i;
8073
8074         /*
8075          * Skip introductions and multiplicators >= 1
8076          * so that we can extract the 'meat' of the pattern that must
8077          * match in the large if() sequence following.
8078          * NOTE that EXACT is NOT covered here, as it is normally
8079          * picked up by the optimiser separately.
8080          *
8081          * This is unfortunate as the optimiser isnt handling lookahead
8082          * properly currently.
8083          *
8084          */
8085         while ((OP(first) == OPEN && (sawopen = 1)) ||
8086                /* An OR of *one* alternative - should not happen now. */
8087             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8088             /* for now we can't handle lookbehind IFMATCH*/
8089             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8090             (OP(first) == PLUS) ||
8091             (OP(first) == MINMOD) ||
8092                /* An {n,m} with n>0 */
8093             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8094             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8095         {
8096                 /*
8097                  * the only op that could be a regnode is PLUS, all the rest
8098                  * will be regnode_1 or regnode_2.
8099                  *
8100                  * (yves doesn't think this is true)
8101                  */
8102                 if (OP(first) == PLUS)
8103                     sawplus = 1;
8104                 else {
8105                     if (OP(first) == MINMOD)
8106                         sawminmod = 1;
8107                     first += regarglen[OP(first)];
8108                 }
8109                 first = NEXTOPER(first);
8110                 first_next= regnext(first);
8111         }
8112
8113         /* Starting-point info. */
8114       again:
8115         DEBUG_PEEP("first:", first, 0, 0);
8116         /* Ignore EXACT as we deal with it later. */
8117         if (PL_regkind[OP(first)] == EXACT) {
8118             if (! isEXACTFish(OP(first))) {
8119                 NOOP;   /* Empty, get anchored substr later. */
8120             }
8121             else
8122                 RExC_rxi->regstclass = first;
8123         }
8124 #ifdef TRIE_STCLASS
8125         else if (PL_regkind[OP(first)] == TRIE &&
8126                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8127         {
8128             /* this can happen only on restudy */
8129             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8130         }
8131 #endif
8132         else if (REGNODE_SIMPLE(OP(first)))
8133             RExC_rxi->regstclass = first;
8134         else if (PL_regkind[OP(first)] == BOUND ||
8135                  PL_regkind[OP(first)] == NBOUND)
8136             RExC_rxi->regstclass = first;
8137         else if (PL_regkind[OP(first)] == BOL) {
8138             RExC_rx->intflags |= (OP(first) == MBOL
8139                            ? PREGf_ANCH_MBOL
8140                            : PREGf_ANCH_SBOL);
8141             first = NEXTOPER(first);
8142             goto again;
8143         }
8144         else if (OP(first) == GPOS) {
8145             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8146             first = NEXTOPER(first);
8147             goto again;
8148         }
8149         else if ((!sawopen || !RExC_sawback) &&
8150             !sawlookahead &&
8151             (OP(first) == STAR &&
8152             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8153             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8154         {
8155             /* turn .* into ^.* with an implied $*=1 */
8156             const int type =
8157                 (OP(NEXTOPER(first)) == REG_ANY)
8158                     ? PREGf_ANCH_MBOL
8159                     : PREGf_ANCH_SBOL;
8160             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8161             first = NEXTOPER(first);
8162             goto again;
8163         }
8164         if (sawplus && !sawminmod && !sawlookahead
8165             && (!sawopen || !RExC_sawback)
8166             && !pRExC_state->code_blocks) /* May examine pos and $& */
8167             /* x+ must match at the 1st pos of run of x's */
8168             RExC_rx->intflags |= PREGf_SKIP;
8169
8170         /* Scan is after the zeroth branch, first is atomic matcher. */
8171 #ifdef TRIE_STUDY_OPT
8172         DEBUG_PARSE_r(
8173             if (!restudied)
8174                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8175                               (IV)(first - scan + 1))
8176         );
8177 #else
8178         DEBUG_PARSE_r(
8179             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8180                 (IV)(first - scan + 1))
8181         );
8182 #endif
8183
8184
8185         /*
8186         * If there's something expensive in the r.e., find the
8187         * longest literal string that must appear and make it the
8188         * regmust.  Resolve ties in favor of later strings, since
8189         * the regstart check works with the beginning of the r.e.
8190         * and avoiding duplication strengthens checking.  Not a
8191         * strong reason, but sufficient in the absence of others.
8192         * [Now we resolve ties in favor of the earlier string if
8193         * it happens that c_offset_min has been invalidated, since the
8194         * earlier string may buy us something the later one won't.]
8195         */
8196
8197         data.substrs[0].str = newSVpvs("");
8198         data.substrs[1].str = newSVpvs("");
8199         data.last_found = newSVpvs("");
8200         data.cur_is_floating = 0; /* initially any found substring is fixed */
8201         ENTER_with_name("study_chunk");
8202         SAVEFREESV(data.substrs[0].str);
8203         SAVEFREESV(data.substrs[1].str);
8204         SAVEFREESV(data.last_found);
8205         first = scan;
8206         if (!RExC_rxi->regstclass) {
8207             ssc_init(pRExC_state, &ch_class);
8208             data.start_class = &ch_class;
8209             stclass_flag = SCF_DO_STCLASS_AND;
8210         } else                          /* XXXX Check for BOUND? */
8211             stclass_flag = 0;
8212         data.last_closep = &last_close;
8213
8214         DEBUG_RExC_seen();
8215         /*
8216          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8217          * (NO top level branches)
8218          */
8219         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8220                              scan + RExC_size, /* Up to end */
8221             &data, -1, 0, NULL,
8222             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8223                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8224             0, TRUE);
8225
8226
8227         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8228
8229
8230         if ( RExC_total_parens == 1 && !data.cur_is_floating
8231              && data.last_start_min == 0 && data.last_end > 0
8232              && !RExC_seen_zerolen
8233              && !(RExC_seen & REG_VERBARG_SEEN)
8234              && !(RExC_seen & REG_GPOS_SEEN)
8235         ){
8236             RExC_rx->extflags |= RXf_CHECK_ALL;
8237         }
8238         scan_commit(pRExC_state, &data,&minlen, 0);
8239
8240
8241         /* XXX this is done in reverse order because that's the way the
8242          * code was before it was parameterised. Don't know whether it
8243          * actually needs doing in reverse order. DAPM */
8244         for (i = 1; i >= 0; i--) {
8245             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8246
8247             if (   !(   i
8248                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8249                      &&    data.substrs[0].min_offset
8250                         == data.substrs[1].min_offset
8251                      &&    SvCUR(data.substrs[0].str)
8252                         == SvCUR(data.substrs[1].str)
8253                     )
8254                 && S_setup_longest (aTHX_ pRExC_state,
8255                                         &(RExC_rx->substrs->data[i]),
8256                                         &(data.substrs[i]),
8257                                         longest_length[i]))
8258             {
8259                 RExC_rx->substrs->data[i].min_offset =
8260                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8261
8262                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8263                 /* Don't offset infinity */
8264                 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8265                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8266                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8267             }
8268             else {
8269                 RExC_rx->substrs->data[i].substr      = NULL;
8270                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8271                 longest_length[i] = 0;
8272             }
8273         }
8274
8275         LEAVE_with_name("study_chunk");
8276
8277         if (RExC_rxi->regstclass
8278             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8279             RExC_rxi->regstclass = NULL;
8280
8281         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8282               || RExC_rx->substrs->data[0].min_offset)
8283             && stclass_flag
8284             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8285             && is_ssc_worth_it(pRExC_state, data.start_class))
8286         {
8287             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8288
8289             ssc_finalize(pRExC_state, data.start_class);
8290
8291             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8292             StructCopy(data.start_class,
8293                        (regnode_ssc*)RExC_rxi->data->data[n],
8294                        regnode_ssc);
8295             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8296             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8297             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8298                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8299                       Perl_re_printf( aTHX_
8300                                     "synthetic stclass \"%s\".\n",
8301                                     SvPVX_const(sv));});
8302             data.start_class = NULL;
8303         }
8304
8305         /* A temporary algorithm prefers floated substr to fixed one of
8306          * same length to dig more info. */
8307         i = (longest_length[0] <= longest_length[1]);
8308         RExC_rx->substrs->check_ix = i;
8309         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8310         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8311         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8312         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8313         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8314         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8315             RExC_rx->intflags |= PREGf_NOSCAN;
8316
8317         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8318             RExC_rx->extflags |= RXf_USE_INTUIT;
8319             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8320                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8321         }
8322
8323         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8324         if ( (STRLEN)minlen < longest_length[1] )
8325             minlen= longest_length[1];
8326         if ( (STRLEN)minlen < longest_length[0] )
8327             minlen= longest_length[0];
8328         */
8329     }
8330     else {
8331         /* Several toplevels. Best we can is to set minlen. */
8332         SSize_t fake;
8333         regnode_ssc ch_class;
8334         SSize_t last_close = 0;
8335
8336         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8337
8338         scan = RExC_rxi->program + 1;
8339         ssc_init(pRExC_state, &ch_class);
8340         data.start_class = &ch_class;
8341         data.last_closep = &last_close;
8342
8343         DEBUG_RExC_seen();
8344         /*
8345          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8346          * (patterns WITH top level branches)
8347          */
8348         minlen = study_chunk(pRExC_state,
8349             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8350             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8351                                                       ? SCF_TRIE_DOING_RESTUDY
8352                                                       : 0),
8353             0, TRUE);
8354
8355         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8356
8357         RExC_rx->check_substr = NULL;
8358         RExC_rx->check_utf8 = NULL;
8359         RExC_rx->substrs->data[0].substr      = NULL;
8360         RExC_rx->substrs->data[0].utf8_substr = NULL;
8361         RExC_rx->substrs->data[1].substr      = NULL;
8362         RExC_rx->substrs->data[1].utf8_substr = NULL;
8363
8364         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8365             && is_ssc_worth_it(pRExC_state, data.start_class))
8366         {
8367             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8368
8369             ssc_finalize(pRExC_state, data.start_class);
8370
8371             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8372             StructCopy(data.start_class,
8373                        (regnode_ssc*)RExC_rxi->data->data[n],
8374                        regnode_ssc);
8375             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8376             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8377             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8378                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8379                       Perl_re_printf( aTHX_
8380                                     "synthetic stclass \"%s\".\n",
8381                                     SvPVX_const(sv));});
8382             data.start_class = NULL;
8383         }
8384     }
8385
8386     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8387         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8388         RExC_rx->maxlen = REG_INFTY;
8389     }
8390     else {
8391         RExC_rx->maxlen = RExC_maxlen;
8392     }
8393
8394     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8395        the "real" pattern. */
8396     DEBUG_OPTIMISE_r({
8397         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8398                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8399     });
8400     RExC_rx->minlenret = minlen;
8401     if (RExC_rx->minlen < minlen)
8402         RExC_rx->minlen = minlen;
8403
8404     if (RExC_seen & REG_RECURSE_SEEN ) {
8405         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8406         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8407     }
8408     if (RExC_seen & REG_GPOS_SEEN)
8409         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8410     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8411         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8412                                                 lookbehind */
8413     if (pRExC_state->code_blocks)
8414         RExC_rx->extflags |= RXf_EVAL_SEEN;
8415     if (RExC_seen & REG_VERBARG_SEEN)
8416     {
8417         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8418         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8419     }
8420     if (RExC_seen & REG_CUTGROUP_SEEN)
8421         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8422     if (pm_flags & PMf_USE_RE_EVAL)
8423         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8424     if (RExC_paren_names)
8425         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8426     else
8427         RXp_PAREN_NAMES(RExC_rx) = NULL;
8428
8429     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8430      * so it can be used in pp.c */
8431     if (RExC_rx->intflags & PREGf_ANCH)
8432         RExC_rx->extflags |= RXf_IS_ANCHORED;
8433
8434
8435     {
8436         /* this is used to identify "special" patterns that might result
8437          * in Perl NOT calling the regex engine and instead doing the match "itself",
8438          * particularly special cases in split//. By having the regex compiler
8439          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8440          * we avoid weird issues with equivalent patterns resulting in different behavior,
8441          * AND we allow non Perl engines to get the same optimizations by the setting the
8442          * flags appropriately - Yves */
8443         regnode *first = RExC_rxi->program + 1;
8444         U8 fop = OP(first);
8445         regnode *next = regnext(first);
8446         U8 nop = OP(next);
8447
8448         if (PL_regkind[fop] == NOTHING && nop == END)
8449             RExC_rx->extflags |= RXf_NULL;
8450         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8451             /* when fop is SBOL first->flags will be true only when it was
8452              * produced by parsing /\A/, and not when parsing /^/. This is
8453              * very important for the split code as there we want to
8454              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8455              * See rt #122761 for more details. -- Yves */
8456             RExC_rx->extflags |= RXf_START_ONLY;
8457         else if (fop == PLUS
8458                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8459                  && nop == END)
8460             RExC_rx->extflags |= RXf_WHITE;
8461         else if ( RExC_rx->extflags & RXf_SPLIT
8462                   && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop))
8463                   && STR_LEN(first) == 1
8464                   && *(STRING(first)) == ' '
8465                   && nop == END )
8466             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8467
8468     }
8469
8470     if (RExC_contains_locale) {
8471         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8472     }
8473
8474 #ifdef DEBUGGING
8475     if (RExC_paren_names) {
8476         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8477         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8478                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8479     } else
8480 #endif
8481     RExC_rxi->name_list_idx = 0;
8482
8483     while ( RExC_recurse_count > 0 ) {
8484         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8485         /*
8486          * This data structure is set up in study_chunk() and is used
8487          * to calculate the distance between a GOSUB regopcode and
8488          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8489          * it refers to.
8490          *
8491          * If for some reason someone writes code that optimises
8492          * away a GOSUB opcode then the assert should be changed to
8493          * an if(scan) to guard the ARG2L_SET() - Yves
8494          *
8495          */
8496         assert(scan && OP(scan) == GOSUB);
8497         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8498     }
8499
8500     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8501     /* assume we don't need to swap parens around before we match */
8502     DEBUG_TEST_r({
8503         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8504             (unsigned long)RExC_study_chunk_recursed_count);
8505     });
8506     DEBUG_DUMP_r({
8507         DEBUG_RExC_seen();
8508         Perl_re_printf( aTHX_ "Final program:\n");
8509         regdump(RExC_rx);
8510     });
8511
8512     if (RExC_open_parens) {
8513         Safefree(RExC_open_parens);
8514         RExC_open_parens = NULL;
8515     }
8516     if (RExC_close_parens) {
8517         Safefree(RExC_close_parens);
8518         RExC_close_parens = NULL;
8519     }
8520
8521 #ifdef USE_ITHREADS
8522     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8523      * by setting the regexp SV to readonly-only instead. If the
8524      * pattern's been recompiled, the USEDness should remain. */
8525     if (old_re && SvREADONLY(old_re))
8526         SvREADONLY_on(Rx);
8527 #endif
8528     return Rx;
8529 }
8530
8531
8532 SV*
8533 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8534                     const U32 flags)
8535 {
8536     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8537
8538     PERL_UNUSED_ARG(value);
8539
8540     if (flags & RXapif_FETCH) {
8541         return reg_named_buff_fetch(rx, key, flags);
8542     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8543         Perl_croak_no_modify();
8544         return NULL;
8545     } else if (flags & RXapif_EXISTS) {
8546         return reg_named_buff_exists(rx, key, flags)
8547             ? &PL_sv_yes
8548             : &PL_sv_no;
8549     } else if (flags & RXapif_REGNAMES) {
8550         return reg_named_buff_all(rx, flags);
8551     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8552         return reg_named_buff_scalar(rx, flags);
8553     } else {
8554         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8555         return NULL;
8556     }
8557 }
8558
8559 SV*
8560 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8561                          const U32 flags)
8562 {
8563     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8564     PERL_UNUSED_ARG(lastkey);
8565
8566     if (flags & RXapif_FIRSTKEY)
8567         return reg_named_buff_firstkey(rx, flags);
8568     else if (flags & RXapif_NEXTKEY)
8569         return reg_named_buff_nextkey(rx, flags);
8570     else {
8571         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8572                                             (int)flags);
8573         return NULL;
8574     }
8575 }
8576
8577 SV*
8578 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8579                           const U32 flags)
8580 {
8581     SV *ret;
8582     struct regexp *const rx = ReANY(r);
8583
8584     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8585
8586     if (rx && RXp_PAREN_NAMES(rx)) {
8587         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8588         if (he_str) {
8589             IV i;
8590             SV* sv_dat=HeVAL(he_str);
8591             I32 *nums=(I32*)SvPVX(sv_dat);
8592             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8593             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8594                 if ((I32)(rx->nparens) >= nums[i]
8595                     && rx->offs[nums[i]].start != -1
8596                     && rx->offs[nums[i]].end != -1)
8597                 {
8598                     ret = newSVpvs("");
8599                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8600                     if (!retarray)
8601                         return ret;
8602                 } else {
8603                     if (retarray)
8604                         ret = newSVsv(&PL_sv_undef);
8605                 }
8606                 if (retarray)
8607                     av_push(retarray, ret);
8608             }
8609             if (retarray)
8610                 return newRV_noinc(MUTABLE_SV(retarray));
8611         }
8612     }
8613     return NULL;
8614 }
8615
8616 bool
8617 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8618                            const U32 flags)
8619 {
8620     struct regexp *const rx = ReANY(r);
8621
8622     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8623
8624     if (rx && RXp_PAREN_NAMES(rx)) {
8625         if (flags & RXapif_ALL) {
8626             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8627         } else {
8628             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8629             if (sv) {
8630                 SvREFCNT_dec_NN(sv);
8631                 return TRUE;
8632             } else {
8633                 return FALSE;
8634             }
8635         }
8636     } else {
8637         return FALSE;
8638     }
8639 }
8640
8641 SV*
8642 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8643 {
8644     struct regexp *const rx = ReANY(r);
8645
8646     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8647
8648     if ( rx && RXp_PAREN_NAMES(rx) ) {
8649         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8650
8651         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8652     } else {
8653         return FALSE;
8654     }
8655 }
8656
8657 SV*
8658 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8659 {
8660     struct regexp *const rx = ReANY(r);
8661     DECLARE_AND_GET_RE_DEBUG_FLAGS;
8662
8663     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8664
8665     if (rx && RXp_PAREN_NAMES(rx)) {
8666         HV *hv = RXp_PAREN_NAMES(rx);
8667         HE *temphe;
8668         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8669             IV i;
8670             IV parno = 0;
8671             SV* sv_dat = HeVAL(temphe);
8672             I32 *nums = (I32*)SvPVX(sv_dat);
8673             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8674                 if ((I32)(rx->lastparen) >= nums[i] &&
8675                     rx->offs[nums[i]].start != -1 &&
8676                     rx->offs[nums[i]].end != -1)
8677                 {
8678                     parno = nums[i];
8679                     break;
8680                 }
8681             }
8682             if (parno || flags & RXapif_ALL) {
8683                 return newSVhek(HeKEY_hek(temphe));
8684             }
8685         }
8686     }
8687     return NULL;
8688 }
8689
8690 SV*
8691 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8692 {
8693     SV *ret;
8694     AV *av;
8695     SSize_t length;
8696     struct regexp *const rx = ReANY(r);
8697
8698     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8699
8700     if (rx && RXp_PAREN_NAMES(rx)) {
8701         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8702             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8703         } else if (flags & RXapif_ONE) {
8704             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8705             av = MUTABLE_AV(SvRV(ret));
8706             length = av_count(av);
8707             SvREFCNT_dec_NN(ret);
8708             return newSViv(length);
8709         } else {
8710             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8711                                                 (int)flags);
8712             return NULL;
8713         }
8714     }
8715     return &PL_sv_undef;
8716 }
8717
8718 SV*
8719 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8720 {
8721     struct regexp *const rx = ReANY(r);
8722     AV *av = newAV();
8723
8724     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8725
8726     if (rx && RXp_PAREN_NAMES(rx)) {
8727         HV *hv= RXp_PAREN_NAMES(rx);
8728         HE *temphe;
8729         (void)hv_iterinit(hv);
8730         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8731             IV i;
8732             IV parno = 0;
8733             SV* sv_dat = HeVAL(temphe);
8734             I32 *nums = (I32*)SvPVX(sv_dat);
8735             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8736                 if ((I32)(rx->lastparen) >= nums[i] &&
8737                     rx->offs[nums[i]].start != -1 &&
8738                     rx->offs[nums[i]].end != -1)
8739                 {
8740                     parno = nums[i];
8741                     break;
8742                 }
8743             }
8744             if (parno || flags & RXapif_ALL) {
8745                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8746             }
8747         }
8748     }
8749
8750     return newRV_noinc(MUTABLE_SV(av));
8751 }
8752
8753 void
8754 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8755                              SV * const sv)
8756 {
8757     struct regexp *const rx = ReANY(r);
8758     char *s = NULL;
8759     SSize_t i = 0;
8760     SSize_t s1, t1;
8761     I32 n = paren;
8762
8763     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8764
8765     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8766            || n == RX_BUFF_IDX_CARET_FULLMATCH
8767            || n == RX_BUFF_IDX_CARET_POSTMATCH
8768        )
8769     {
8770         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8771         if (!keepcopy) {
8772             /* on something like
8773              *    $r = qr/.../;
8774              *    /$qr/p;
8775              * the KEEPCOPY is set on the PMOP rather than the regex */
8776             if (PL_curpm && r == PM_GETRE(PL_curpm))
8777                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8778         }
8779         if (!keepcopy)
8780             goto ret_undef;
8781     }
8782
8783     if (!rx->subbeg)
8784         goto ret_undef;
8785
8786     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8787         /* no need to distinguish between them any more */
8788         n = RX_BUFF_IDX_FULLMATCH;
8789
8790     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8791         && rx->offs[0].start != -1)
8792     {
8793         /* $`, ${^PREMATCH} */
8794         i = rx->offs[0].start;
8795         s = rx->subbeg;
8796     }
8797     else
8798     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8799         && rx->offs[0].end != -1)
8800     {
8801         /* $', ${^POSTMATCH} */
8802         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8803         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8804     }
8805     else
8806     if (inRANGE(n, 0, (I32)rx->nparens) &&
8807         (s1 = rx->offs[n].start) != -1  &&
8808         (t1 = rx->offs[n].end) != -1)
8809     {
8810         /* $&, ${^MATCH},  $1 ... */
8811         i = t1 - s1;
8812         s = rx->subbeg + s1 - rx->suboffset;
8813     } else {
8814         goto ret_undef;
8815     }
8816
8817     assert(s >= rx->subbeg);
8818     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8819     if (i >= 0) {
8820 #ifdef NO_TAINT_SUPPORT
8821         sv_setpvn(sv, s, i);
8822 #else
8823         const int oldtainted = TAINT_get;
8824         TAINT_NOT;
8825         sv_setpvn(sv, s, i);
8826         TAINT_set(oldtainted);
8827 #endif
8828         if (RXp_MATCH_UTF8(rx))
8829             SvUTF8_on(sv);
8830         else
8831             SvUTF8_off(sv);
8832         if (TAINTING_get) {
8833             if (RXp_MATCH_TAINTED(rx)) {
8834                 if (SvTYPE(sv) >= SVt_PVMG) {
8835                     MAGIC* const mg = SvMAGIC(sv);
8836                     MAGIC* mgt;
8837                     TAINT;
8838                     SvMAGIC_set(sv, mg->mg_moremagic);
8839                     SvTAINT(sv);
8840                     if ((mgt = SvMAGIC(sv))) {
8841                         mg->mg_moremagic = mgt;
8842                         SvMAGIC_set(sv, mg);
8843                     }
8844                 } else {
8845                     TAINT;
8846                     SvTAINT(sv);
8847                 }
8848             } else
8849                 SvTAINTED_off(sv);
8850         }
8851     } else {
8852       ret_undef:
8853         sv_set_undef(sv);
8854         return;
8855     }
8856 }
8857
8858 void
8859 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8860                                                          SV const * const value)
8861 {
8862     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8863
8864     PERL_UNUSED_ARG(rx);
8865     PERL_UNUSED_ARG(paren);
8866     PERL_UNUSED_ARG(value);
8867
8868     if (!PL_localizing)
8869         Perl_croak_no_modify();
8870 }
8871
8872 I32
8873 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8874                               const I32 paren)
8875 {
8876     struct regexp *const rx = ReANY(r);
8877     I32 i;
8878     I32 s1, t1;
8879
8880     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8881
8882     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8883         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8884         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8885     )
8886     {
8887         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8888         if (!keepcopy) {
8889             /* on something like
8890              *    $r = qr/.../;
8891              *    /$qr/p;
8892              * the KEEPCOPY is set on the PMOP rather than the regex */
8893             if (PL_curpm && r == PM_GETRE(PL_curpm))
8894                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8895         }
8896         if (!keepcopy)
8897             goto warn_undef;
8898     }
8899
8900     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8901     switch (paren) {
8902       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8903       case RX_BUFF_IDX_PREMATCH:       /* $` */
8904         if (rx->offs[0].start != -1) {
8905                         i = rx->offs[0].start;
8906                         if (i > 0) {
8907                                 s1 = 0;
8908                                 t1 = i;
8909                                 goto getlen;
8910                         }
8911             }
8912         return 0;
8913
8914       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8915       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8916             if (rx->offs[0].end != -1) {
8917                         i = rx->sublen - rx->offs[0].end;
8918                         if (i > 0) {
8919                                 s1 = rx->offs[0].end;
8920                                 t1 = rx->sublen;
8921                                 goto getlen;
8922                         }
8923             }
8924         return 0;
8925
8926       default: /* $& / ${^MATCH}, $1, $2, ... */
8927             if (paren <= (I32)rx->nparens &&
8928             (s1 = rx->offs[paren].start) != -1 &&
8929             (t1 = rx->offs[paren].end) != -1)
8930             {
8931             i = t1 - s1;
8932             goto getlen;
8933         } else {
8934           warn_undef:
8935             if (ckWARN(WARN_UNINITIALIZED))
8936                 report_uninit((const SV *)sv);
8937             return 0;
8938         }
8939     }
8940   getlen:
8941     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8942         const char * const s = rx->subbeg - rx->suboffset + s1;
8943         const U8 *ep;
8944         STRLEN el;
8945
8946         i = t1 - s1;
8947         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8948             i = el;
8949     }
8950     return i;
8951 }
8952
8953 SV*
8954 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8955 {
8956     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8957         PERL_UNUSED_ARG(rx);
8958         if (0)
8959             return NULL;
8960         else
8961             return newSVpvs("Regexp");
8962 }
8963
8964 /* Scans the name of a named buffer from the pattern.
8965  * If flags is REG_RSN_RETURN_NULL returns null.
8966  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8967  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8968  * to the parsed name as looked up in the RExC_paren_names hash.
8969  * If there is an error throws a vFAIL().. type exception.
8970  */
8971
8972 #define REG_RSN_RETURN_NULL    0
8973 #define REG_RSN_RETURN_NAME    1
8974 #define REG_RSN_RETURN_DATA    2
8975
8976 STATIC SV*
8977 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8978 {
8979     char *name_start = RExC_parse;
8980     SV* sv_name;
8981
8982     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8983
8984     assert (RExC_parse <= RExC_end);
8985     if (RExC_parse == RExC_end) NOOP;
8986     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8987          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8988           * using do...while */
8989         if (UTF)
8990             do {
8991                 RExC_parse += UTF8SKIP(RExC_parse);
8992             } while (   RExC_parse < RExC_end
8993                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8994         else
8995             do {
8996                 RExC_parse++;
8997             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8998     } else {
8999         RExC_parse++; /* so the <- from the vFAIL is after the offending
9000                          character */
9001         vFAIL("Group name must start with a non-digit word character");
9002     }
9003     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9004                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9005     if ( flags == REG_RSN_RETURN_NAME)
9006         return sv_name;
9007     else if (flags==REG_RSN_RETURN_DATA) {
9008         HE *he_str = NULL;
9009         SV *sv_dat = NULL;
9010         if ( ! sv_name )      /* should not happen*/
9011             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9012         if (RExC_paren_names)
9013             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9014         if ( he_str )
9015             sv_dat = HeVAL(he_str);
9016         if ( ! sv_dat ) {   /* Didn't find group */
9017
9018             /* It might be a forward reference; we can't fail until we
9019                 * know, by completing the parse to get all the groups, and
9020                 * then reparsing */
9021             if (ALL_PARENS_COUNTED)  {
9022                 vFAIL("Reference to nonexistent named group");
9023             }
9024             else {
9025                 REQUIRE_PARENS_PASS;
9026             }
9027         }
9028         return sv_dat;
9029     }
9030
9031     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9032                      (unsigned long) flags);
9033 }
9034
9035 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
9036     if (RExC_lastparse!=RExC_parse) {                           \
9037         Perl_re_printf( aTHX_  "%s",                            \
9038             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
9039                 RExC_end - RExC_parse, 16,                      \
9040                 "", "",                                         \
9041                 PERL_PV_ESCAPE_UNI_DETECT |                     \
9042                 PERL_PV_PRETTY_ELLIPSES   |                     \
9043                 PERL_PV_PRETTY_LTGT       |                     \
9044                 PERL_PV_ESCAPE_RE         |                     \
9045                 PERL_PV_PRETTY_EXACTSIZE                        \
9046             )                                                   \
9047         );                                                      \
9048     } else                                                      \
9049         Perl_re_printf( aTHX_ "%16s","");                       \
9050                                                                 \
9051     if (RExC_lastnum!=RExC_emit)                                \
9052        Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
9053     else                                                        \
9054        Perl_re_printf( aTHX_ "|%4s","");                        \
9055     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
9056         (int)((depth*2)), "",                                   \
9057         (funcname)                                              \
9058     );                                                          \
9059     RExC_lastnum=RExC_emit;                                     \
9060     RExC_lastparse=RExC_parse;                                  \
9061 })
9062
9063
9064
9065 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
9066     DEBUG_PARSE_MSG((funcname));                            \
9067     Perl_re_printf( aTHX_ "%4s","\n");                                  \
9068 })
9069 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
9070     DEBUG_PARSE_MSG((funcname));                            \
9071     Perl_re_printf( aTHX_ fmt "\n",args);                               \
9072 })
9073
9074 /* This section of code defines the inversion list object and its methods.  The
9075  * interfaces are highly subject to change, so as much as possible is static to
9076  * this file.  An inversion list is here implemented as a malloc'd C UV array
9077  * as an SVt_INVLIST scalar.
9078  *
9079  * An inversion list for Unicode is an array of code points, sorted by ordinal
9080  * number.  Each element gives the code point that begins a range that extends
9081  * up-to but not including the code point given by the next element.  The final
9082  * element gives the first code point of a range that extends to the platform's
9083  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
9084  * ...) give ranges whose code points are all in the inversion list.  We say
9085  * that those ranges are in the set.  The odd-numbered elements give ranges
9086  * whose code points are not in the inversion list, and hence not in the set.
9087  * Thus, element [0] is the first code point in the list.  Element [1]
9088  * is the first code point beyond that not in the list; and element [2] is the
9089  * first code point beyond that that is in the list.  In other words, the first
9090  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9091  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
9092  * all code points in that range are not in the inversion list.  The third
9093  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9094  * list, and so forth.  Thus every element whose index is divisible by two
9095  * gives the beginning of a range that is in the list, and every element whose
9096  * index is not divisible by two gives the beginning of a range not in the
9097  * list.  If the final element's index is divisible by two, the inversion list
9098  * extends to the platform's infinity; otherwise the highest code point in the
9099  * inversion list is the contents of that element minus 1.
9100  *
9101  * A range that contains just a single code point N will look like
9102  *  invlist[i]   == N
9103  *  invlist[i+1] == N+1
9104  *
9105  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9106  * impossible to represent, so element [i+1] is omitted.  The single element
9107  * inversion list
9108  *  invlist[0] == UV_MAX
9109  * contains just UV_MAX, but is interpreted as matching to infinity.
9110  *
9111  * Taking the complement (inverting) an inversion list is quite simple, if the
9112  * first element is 0, remove it; otherwise add a 0 element at the beginning.
9113  * This implementation reserves an element at the beginning of each inversion
9114  * list to always contain 0; there is an additional flag in the header which
9115  * indicates if the list begins at the 0, or is offset to begin at the next
9116  * element.  This means that the inversion list can be inverted without any
9117  * copying; just flip the flag.
9118  *
9119  * More about inversion lists can be found in "Unicode Demystified"
9120  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9121  *
9122  * The inversion list data structure is currently implemented as an SV pointing
9123  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9124  * array of UV whose memory management is automatically handled by the existing
9125  * facilities for SV's.
9126  *
9127  * Some of the methods should always be private to the implementation, and some
9128  * should eventually be made public */
9129
9130 /* The header definitions are in F<invlist_inline.h> */
9131
9132 #ifndef PERL_IN_XSUB_RE
9133
9134 PERL_STATIC_INLINE UV*
9135 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9136 {
9137     /* Returns a pointer to the first element in the inversion list's array.
9138      * This is called upon initialization of an inversion list.  Where the
9139      * array begins depends on whether the list has the code point U+0000 in it
9140      * or not.  The other parameter tells it whether the code that follows this
9141      * call is about to put a 0 in the inversion list or not.  The first
9142      * element is either the element reserved for 0, if TRUE, or the element
9143      * after it, if FALSE */
9144
9145     bool* offset = get_invlist_offset_addr(invlist);
9146     UV* zero_addr = (UV *) SvPVX(invlist);
9147
9148     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9149
9150     /* Must be empty */
9151     assert(! _invlist_len(invlist));
9152
9153     *zero_addr = 0;
9154
9155     /* 1^1 = 0; 1^0 = 1 */
9156     *offset = 1 ^ will_have_0;
9157     return zero_addr + *offset;
9158 }
9159
9160 STATIC void
9161 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9162 {
9163     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9164      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9165      * is similar to what SvSetMagicSV() would do, if it were implemented on
9166      * inversion lists, though this routine avoids a copy */
9167
9168     const UV src_len          = _invlist_len(src);
9169     const bool src_offset     = *get_invlist_offset_addr(src);
9170     const STRLEN src_byte_len = SvLEN(src);
9171     char * array              = SvPVX(src);
9172
9173     const int oldtainted = TAINT_get;
9174
9175     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9176
9177     assert(is_invlist(src));
9178     assert(is_invlist(dest));
9179     assert(! invlist_is_iterating(src));
9180     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9181
9182     /* Make sure it ends in the right place with a NUL, as our inversion list
9183      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9184      * asserts it */
9185     array[src_byte_len - 1] = '\0';
9186
9187     TAINT_NOT;      /* Otherwise it breaks */
9188     sv_usepvn_flags(dest,
9189                     (char *) array,
9190                     src_byte_len - 1,
9191
9192                     /* This flag is documented to cause a copy to be avoided */
9193                     SV_HAS_TRAILING_NUL);
9194     TAINT_set(oldtainted);
9195     SvPV_set(src, 0);
9196     SvLEN_set(src, 0);
9197     SvCUR_set(src, 0);
9198
9199     /* Finish up copying over the other fields in an inversion list */
9200     *get_invlist_offset_addr(dest) = src_offset;
9201     invlist_set_len(dest, src_len, src_offset);
9202     *get_invlist_previous_index_addr(dest) = 0;
9203     invlist_iterfinish(dest);
9204 }
9205
9206 PERL_STATIC_INLINE IV*
9207 S_get_invlist_previous_index_addr(SV* invlist)
9208 {
9209     /* Return the address of the IV that is reserved to hold the cached index
9210      * */
9211     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9212
9213     assert(is_invlist(invlist));
9214
9215     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9216 }
9217
9218 PERL_STATIC_INLINE IV
9219 S_invlist_previous_index(SV* const invlist)
9220 {
9221     /* Returns cached index of previous search */
9222
9223     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9224
9225     return *get_invlist_previous_index_addr(invlist);
9226 }
9227
9228 PERL_STATIC_INLINE void
9229 S_invlist_set_previous_index(SV* const invlist, const IV index)
9230 {
9231     /* Caches <index> for later retrieval */
9232
9233     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9234
9235     assert(index == 0 || index < (int) _invlist_len(invlist));
9236
9237     *get_invlist_previous_index_addr(invlist) = index;
9238 }
9239
9240 PERL_STATIC_INLINE void
9241 S_invlist_trim(SV* invlist)
9242 {
9243     /* Free the not currently-being-used space in an inversion list */
9244
9245     /* But don't free up the space needed for the 0 UV that is always at the
9246      * beginning of the list, nor the trailing NUL */
9247     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9248
9249     PERL_ARGS_ASSERT_INVLIST_TRIM;
9250
9251     assert(is_invlist(invlist));
9252
9253     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9254 }
9255
9256 PERL_STATIC_INLINE void
9257 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9258 {
9259     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9260
9261     assert(is_invlist(invlist));
9262
9263     invlist_set_len(invlist, 0, 0);
9264     invlist_trim(invlist);
9265 }
9266
9267 #endif /* ifndef PERL_IN_XSUB_RE */
9268
9269 PERL_STATIC_INLINE bool
9270 S_invlist_is_iterating(SV* const invlist)
9271 {
9272     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9273
9274     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9275 }
9276
9277 #ifndef PERL_IN_XSUB_RE
9278
9279 PERL_STATIC_INLINE UV
9280 S_invlist_max(SV* const invlist)
9281 {
9282     /* Returns the maximum number of elements storable in the inversion list's
9283      * array, without having to realloc() */
9284
9285     PERL_ARGS_ASSERT_INVLIST_MAX;
9286
9287     assert(is_invlist(invlist));
9288
9289     /* Assumes worst case, in which the 0 element is not counted in the
9290      * inversion list, so subtracts 1 for that */
9291     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9292            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9293            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9294 }
9295
9296 STATIC void
9297 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9298 {
9299     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9300
9301     /* First 1 is in case the zero element isn't in the list; second 1 is for
9302      * trailing NUL */
9303     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9304     invlist_set_len(invlist, 0, 0);
9305
9306     /* Force iterinit() to be used to get iteration to work */
9307     invlist_iterfinish(invlist);
9308
9309     *get_invlist_previous_index_addr(invlist) = 0;
9310     SvPOK_on(invlist);  /* This allows B to extract the PV */
9311 }
9312
9313 SV*
9314 Perl__new_invlist(pTHX_ IV initial_size)
9315 {
9316
9317     /* Return a pointer to a newly constructed inversion list, with enough
9318      * space to store 'initial_size' elements.  If that number is negative, a
9319      * system default is used instead */
9320
9321     SV* new_list;
9322
9323     if (initial_size < 0) {
9324         initial_size = 10;
9325     }
9326
9327     new_list = newSV_type(SVt_INVLIST);
9328     initialize_invlist_guts(new_list, initial_size);
9329
9330     return new_list;
9331 }
9332
9333 SV*
9334 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9335 {
9336     /* Return a pointer to a newly constructed inversion list, initialized to
9337      * point to <list>, which has to be in the exact correct inversion list
9338      * form, including internal fields.  Thus this is a dangerous routine that
9339      * should not be used in the wrong hands.  The passed in 'list' contains
9340      * several header fields at the beginning that are not part of the
9341      * inversion list body proper */
9342
9343     const STRLEN length = (STRLEN) list[0];
9344     const UV version_id =          list[1];
9345     const bool offset   =    cBOOL(list[2]);
9346 #define HEADER_LENGTH 3
9347     /* If any of the above changes in any way, you must change HEADER_LENGTH
9348      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9349      *      perl -E 'say int(rand 2**31-1)'
9350      */
9351 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9352                                         data structure type, so that one being
9353                                         passed in can be validated to be an
9354                                         inversion list of the correct vintage.
9355                                        */
9356
9357     SV* invlist = newSV_type(SVt_INVLIST);
9358
9359     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9360
9361     if (version_id != INVLIST_VERSION_ID) {
9362         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9363     }
9364
9365     /* The generated array passed in includes header elements that aren't part
9366      * of the list proper, so start it just after them */
9367     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9368
9369     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9370                                shouldn't touch it */
9371
9372     *(get_invlist_offset_addr(invlist)) = offset;
9373
9374     /* The 'length' passed to us is the physical number of elements in the
9375      * inversion list.  But if there is an offset the logical number is one
9376      * less than that */
9377     invlist_set_len(invlist, length  - offset, offset);
9378
9379     invlist_set_previous_index(invlist, 0);
9380
9381     /* Initialize the iteration pointer. */
9382     invlist_iterfinish(invlist);
9383
9384     SvREADONLY_on(invlist);
9385     SvPOK_on(invlist);
9386
9387     return invlist;
9388 }
9389
9390 STATIC void
9391 S__append_range_to_invlist(pTHX_ SV* const invlist,
9392                                  const UV start, const UV end)
9393 {
9394    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9395     * the end of the inversion list.  The range must be above any existing
9396     * ones. */
9397
9398     UV* array;
9399     UV max = invlist_max(invlist);
9400     UV len = _invlist_len(invlist);
9401     bool offset;
9402
9403     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9404
9405     if (len == 0) { /* Empty lists must be initialized */
9406         offset = start != 0;
9407         array = _invlist_array_init(invlist, ! offset);
9408     }
9409     else {
9410         /* Here, the existing list is non-empty. The current max entry in the
9411          * list is generally the first value not in the set, except when the
9412          * set extends to the end of permissible values, in which case it is
9413          * the first entry in that final set, and so this call is an attempt to
9414          * append out-of-order */
9415
9416         UV final_element = len - 1;
9417         array = invlist_array(invlist);
9418         if (   array[final_element] > start
9419             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9420         {
9421             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9422                      array[final_element], start,
9423                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9424         }
9425
9426         /* Here, it is a legal append.  If the new range begins 1 above the end
9427          * of the range below it, it is extending the range below it, so the
9428          * new first value not in the set is one greater than the newly
9429          * extended range.  */
9430         offset = *get_invlist_offset_addr(invlist);
9431         if (array[final_element] == start) {
9432             if (end != UV_MAX) {
9433                 array[final_element] = end + 1;
9434             }
9435             else {
9436                 /* But if the end is the maximum representable on the machine,
9437                  * assume that infinity was actually what was meant.  Just let
9438                  * the range that this would extend to have no end */
9439                 invlist_set_len(invlist, len - 1, offset);
9440             }
9441             return;
9442         }
9443     }
9444
9445     /* Here the new range doesn't extend any existing set.  Add it */
9446
9447     len += 2;   /* Includes an element each for the start and end of range */
9448
9449     /* If wll overflow the existing space, extend, which may cause the array to
9450      * be moved */
9451     if (max < len) {
9452         invlist_extend(invlist, len);
9453
9454         /* Have to set len here to avoid assert failure in invlist_array() */
9455         invlist_set_len(invlist, len, offset);
9456
9457         array = invlist_array(invlist);
9458     }
9459     else {
9460         invlist_set_len(invlist, len, offset);
9461     }
9462
9463     /* The next item on the list starts the range, the one after that is
9464      * one past the new range.  */
9465     array[len - 2] = start;
9466     if (end != UV_MAX) {
9467         array[len - 1] = end + 1;
9468     }
9469     else {
9470         /* But if the end is the maximum representable on the machine, just let
9471          * the range have no end */
9472         invlist_set_len(invlist, len - 1, offset);
9473     }
9474 }
9475
9476 SSize_t
9477 Perl__invlist_search(SV* const invlist, const UV cp)
9478 {
9479     /* Searches the inversion list for the entry that contains the input code
9480      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9481      * return value is the index into the list's array of the range that
9482      * contains <cp>, that is, 'i' such that
9483      *  array[i] <= cp < array[i+1]
9484      */
9485
9486     IV low = 0;
9487     IV mid;
9488     IV high = _invlist_len(invlist);
9489     const IV highest_element = high - 1;
9490     const UV* array;
9491
9492     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9493
9494     /* If list is empty, return failure. */
9495     if (high == 0) {
9496         return -1;
9497     }
9498
9499     /* (We can't get the array unless we know the list is non-empty) */
9500     array = invlist_array(invlist);
9501
9502     mid = invlist_previous_index(invlist);
9503     assert(mid >=0);
9504     if (mid > highest_element) {
9505         mid = highest_element;
9506     }
9507
9508     /* <mid> contains the cache of the result of the previous call to this
9509      * function (0 the first time).  See if this call is for the same result,
9510      * or if it is for mid-1.  This is under the theory that calls to this
9511      * function will often be for related code points that are near each other.
9512      * And benchmarks show that caching gives better results.  We also test
9513      * here if the code point is within the bounds of the list.  These tests
9514      * replace others that would have had to be made anyway to make sure that
9515      * the array bounds were not exceeded, and these give us extra information
9516      * at the same time */
9517     if (cp >= array[mid]) {
9518         if (cp >= array[highest_element]) {
9519             return highest_element;
9520         }
9521
9522         /* Here, array[mid] <= cp < array[highest_element].  This means that
9523          * the final element is not the answer, so can exclude it; it also
9524          * means that <mid> is not the final element, so can refer to 'mid + 1'
9525          * safely */
9526         if (cp < array[mid + 1]) {
9527             return mid;
9528         }
9529         high--;
9530         low = mid + 1;
9531     }
9532     else { /* cp < aray[mid] */
9533         if (cp < array[0]) { /* Fail if outside the array */
9534             return -1;
9535         }
9536         high = mid;
9537         if (cp >= array[mid - 1]) {
9538             goto found_entry;
9539         }
9540     }
9541
9542     /* Binary search.  What we are looking for is <i> such that
9543      *  array[i] <= cp < array[i+1]
9544      * The loop below converges on the i+1.  Note that there may not be an
9545      * (i+1)th element in the array, and things work nonetheless */
9546     while (low < high) {
9547         mid = (low + high) / 2;
9548         assert(mid <= highest_element);
9549         if (array[mid] <= cp) { /* cp >= array[mid] */
9550             low = mid + 1;
9551
9552             /* We could do this extra test to exit the loop early.
9553             if (cp < array[low]) {
9554                 return mid;
9555             }
9556             */
9557         }
9558         else { /* cp < array[mid] */
9559             high = mid;
9560         }
9561     }
9562
9563   found_entry:
9564     high--;
9565     invlist_set_previous_index(invlist, high);
9566     return high;
9567 }
9568
9569 void
9570 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9571                                          const bool complement_b, SV** output)
9572 {
9573     /* Take the union of two inversion lists and point '*output' to it.  On
9574      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9575      * even 'a' or 'b').  If to an inversion list, the contents of the original
9576      * list will be replaced by the union.  The first list, 'a', may be
9577      * NULL, in which case a copy of the second list is placed in '*output'.
9578      * If 'complement_b' is TRUE, the union is taken of the complement
9579      * (inversion) of 'b' instead of b itself.
9580      *
9581      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9582      * Richard Gillam, published by Addison-Wesley, and explained at some
9583      * length there.  The preface says to incorporate its examples into your
9584      * code at your own risk.
9585      *
9586      * The algorithm is like a merge sort. */
9587
9588     const UV* array_a;    /* a's array */
9589     const UV* array_b;
9590     UV len_a;       /* length of a's array */
9591     UV len_b;
9592
9593     SV* u;                      /* the resulting union */
9594     UV* array_u;
9595     UV len_u = 0;
9596
9597     UV i_a = 0;             /* current index into a's array */
9598     UV i_b = 0;
9599     UV i_u = 0;
9600
9601     /* running count, as explained in the algorithm source book; items are
9602      * stopped accumulating and are output when the count changes to/from 0.
9603      * The count is incremented when we start a range that's in an input's set,
9604      * and decremented when we start a range that's not in a set.  So this
9605      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9606      * and hence nothing goes into the union; 1, just one of the inputs is in
9607      * its set (and its current range gets added to the union); and 2 when both
9608      * inputs are in their sets.  */
9609     UV count = 0;
9610
9611     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9612     assert(a != b);
9613     assert(*output == NULL || is_invlist(*output));
9614
9615     len_b = _invlist_len(b);
9616     if (len_b == 0) {
9617
9618         /* Here, 'b' is empty, hence it's complement is all possible code
9619          * points.  So if the union includes the complement of 'b', it includes
9620          * everything, and we need not even look at 'a'.  It's easiest to
9621          * create a new inversion list that matches everything.  */
9622         if (complement_b) {
9623             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9624
9625             if (*output == NULL) { /* If the output didn't exist, just point it
9626                                       at the new list */
9627                 *output = everything;
9628             }
9629             else { /* Otherwise, replace its contents with the new list */
9630                 invlist_replace_list_destroys_src(*output, everything);
9631                 SvREFCNT_dec_NN(everything);
9632             }
9633
9634             return;
9635         }
9636
9637         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9638          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9639          * output will be empty */
9640
9641         if (a == NULL || _invlist_len(a) == 0) {
9642             if (*output == NULL) {
9643                 *output = _new_invlist(0);
9644             }
9645             else {
9646                 invlist_clear(*output);
9647             }
9648             return;
9649         }
9650
9651         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9652          * union.  We can just return a copy of 'a' if '*output' doesn't point
9653          * to an existing list */
9654         if (*output == NULL) {
9655             *output = invlist_clone(a, NULL);
9656             return;
9657         }
9658
9659         /* If the output is to overwrite 'a', we have a no-op, as it's
9660          * already in 'a' */
9661         if (*output == a) {
9662             return;
9663         }
9664
9665         /* Here, '*output' is to be overwritten by 'a' */
9666         u = invlist_clone(a, NULL);
9667         invlist_replace_list_destroys_src(*output, u);
9668         SvREFCNT_dec_NN(u);
9669
9670         return;
9671     }
9672
9673     /* Here 'b' is not empty.  See about 'a' */
9674
9675     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9676
9677         /* Here, 'a' is empty (and b is not).  That means the union will come
9678          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9679          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9680          * the clone */
9681
9682         SV ** dest = (*output == NULL) ? output : &u;
9683         *dest = invlist_clone(b, NULL);
9684         if (complement_b) {
9685             _invlist_invert(*dest);
9686         }
9687
9688         if (dest == &u) {
9689             invlist_replace_list_destroys_src(*output, u);
9690             SvREFCNT_dec_NN(u);
9691         }
9692
9693         return;
9694     }
9695
9696     /* Here both lists exist and are non-empty */
9697     array_a = invlist_array(a);
9698     array_b = invlist_array(b);
9699
9700     /* If are to take the union of 'a' with the complement of b, set it
9701      * up so are looking at b's complement. */
9702     if (complement_b) {
9703
9704         /* To complement, we invert: if the first element is 0, remove it.  To
9705          * do this, we just pretend the array starts one later */
9706         if (array_b[0] == 0) {
9707             array_b++;
9708             len_b--;
9709         }
9710         else {
9711
9712             /* But if the first element is not zero, we pretend the list starts
9713              * at the 0 that is always stored immediately before the array. */
9714             array_b--;
9715             len_b++;
9716         }
9717     }
9718
9719     /* Size the union for the worst case: that the sets are completely
9720      * disjoint */
9721     u = _new_invlist(len_a + len_b);
9722
9723     /* Will contain U+0000 if either component does */
9724     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9725                                       || (len_b > 0 && array_b[0] == 0));
9726
9727     /* Go through each input list item by item, stopping when have exhausted
9728      * one of them */
9729     while (i_a < len_a && i_b < len_b) {
9730         UV cp;      /* The element to potentially add to the union's array */
9731         bool cp_in_set;   /* is it in the input list's set or not */
9732
9733         /* We need to take one or the other of the two inputs for the union.
9734          * Since we are merging two sorted lists, we take the smaller of the
9735          * next items.  In case of a tie, we take first the one that is in its
9736          * set.  If we first took the one not in its set, it would decrement
9737          * the count, possibly to 0 which would cause it to be output as ending
9738          * the range, and the next time through we would take the same number,
9739          * and output it again as beginning the next range.  By doing it the
9740          * opposite way, there is no possibility that the count will be
9741          * momentarily decremented to 0, and thus the two adjoining ranges will
9742          * be seamlessly merged.  (In a tie and both are in the set or both not
9743          * in the set, it doesn't matter which we take first.) */
9744         if (       array_a[i_a] < array_b[i_b]
9745             || (   array_a[i_a] == array_b[i_b]
9746                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9747         {
9748             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9749             cp = array_a[i_a++];
9750         }
9751         else {
9752             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9753             cp = array_b[i_b++];
9754         }
9755
9756         /* Here, have chosen which of the two inputs to look at.  Only output
9757          * if the running count changes to/from 0, which marks the
9758          * beginning/end of a range that's in the set */
9759         if (cp_in_set) {
9760             if (count == 0) {
9761                 array_u[i_u++] = cp;
9762             }
9763             count++;
9764         }
9765         else {
9766             count--;
9767             if (count == 0) {
9768                 array_u[i_u++] = cp;
9769             }
9770         }
9771     }
9772
9773
9774     /* The loop above increments the index into exactly one of the input lists
9775      * each iteration, and ends when either index gets to its list end.  That
9776      * means the other index is lower than its end, and so something is
9777      * remaining in that one.  We decrement 'count', as explained below, if
9778      * that list is in its set.  (i_a and i_b each currently index the element
9779      * beyond the one we care about.) */
9780     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9781         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9782     {
9783         count--;
9784     }
9785
9786     /* Above we decremented 'count' if the list that had unexamined elements in
9787      * it was in its set.  This has made it so that 'count' being non-zero
9788      * means there isn't anything left to output; and 'count' equal to 0 means
9789      * that what is left to output is precisely that which is left in the
9790      * non-exhausted input list.
9791      *
9792      * To see why, note first that the exhausted input obviously has nothing
9793      * left to add to the union.  If it was in its set at its end, that means
9794      * the set extends from here to the platform's infinity, and hence so does
9795      * the union and the non-exhausted set is irrelevant.  The exhausted set
9796      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9797      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9798      * 'count' remains at 1.  This is consistent with the decremented 'count'
9799      * != 0 meaning there's nothing left to add to the union.
9800      *
9801      * But if the exhausted input wasn't in its set, it contributed 0 to
9802      * 'count', and the rest of the union will be whatever the other input is.
9803      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9804      * otherwise it gets decremented to 0.  This is consistent with 'count'
9805      * == 0 meaning the remainder of the union is whatever is left in the
9806      * non-exhausted list. */
9807     if (count != 0) {
9808         len_u = i_u;
9809     }
9810     else {
9811         IV copy_count = len_a - i_a;
9812         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9813             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9814         }
9815         else { /* The non-exhausted input is b */
9816             copy_count = len_b - i_b;
9817             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9818         }
9819         len_u = i_u + copy_count;
9820     }
9821
9822     /* Set the result to the final length, which can change the pointer to
9823      * array_u, so re-find it.  (Note that it is unlikely that this will
9824      * change, as we are shrinking the space, not enlarging it) */
9825     if (len_u != _invlist_len(u)) {
9826         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9827         invlist_trim(u);
9828         array_u = invlist_array(u);
9829     }
9830
9831     if (*output == NULL) {  /* Simply return the new inversion list */
9832         *output = u;
9833     }
9834     else {
9835         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9836          * could instead free '*output', and then set it to 'u', but experience
9837          * has shown [perl #127392] that if the input is a mortal, we can get a
9838          * huge build-up of these during regex compilation before they get
9839          * freed. */
9840         invlist_replace_list_destroys_src(*output, u);
9841         SvREFCNT_dec_NN(u);
9842     }
9843
9844     return;
9845 }
9846
9847 void
9848 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9849                                                const bool complement_b, SV** i)
9850 {
9851     /* Take the intersection of two inversion lists and point '*i' to it.  On
9852      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9853      * even 'a' or 'b').  If to an inversion list, the contents of the original
9854      * list will be replaced by the intersection.  The first list, 'a', may be
9855      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9856      * TRUE, the result will be the intersection of 'a' and the complement (or
9857      * inversion) of 'b' instead of 'b' directly.
9858      *
9859      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9860      * Richard Gillam, published by Addison-Wesley, and explained at some
9861      * length there.  The preface says to incorporate its examples into your
9862      * code at your own risk.  In fact, it had bugs
9863      *
9864      * The algorithm is like a merge sort, and is essentially the same as the
9865      * union above
9866      */
9867
9868     const UV* array_a;          /* a's array */
9869     const UV* array_b;
9870     UV len_a;   /* length of a's array */
9871     UV len_b;
9872
9873     SV* r;                   /* the resulting intersection */
9874     UV* array_r;
9875     UV len_r = 0;
9876
9877     UV i_a = 0;             /* current index into a's array */
9878     UV i_b = 0;
9879     UV i_r = 0;
9880
9881     /* running count of how many of the two inputs are postitioned at ranges
9882      * that are in their sets.  As explained in the algorithm source book,
9883      * items are stopped accumulating and are output when the count changes
9884      * to/from 2.  The count is incremented when we start a range that's in an
9885      * input's set, and decremented when we start a range that's not in a set.
9886      * Only when it is 2 are we in the intersection. */
9887     UV count = 0;
9888
9889     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9890     assert(a != b);
9891     assert(*i == NULL || is_invlist(*i));
9892
9893     /* Special case if either one is empty */
9894     len_a = (a == NULL) ? 0 : _invlist_len(a);
9895     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9896         if (len_a != 0 && complement_b) {
9897
9898             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9899              * must be empty.  Here, also we are using 'b's complement, which
9900              * hence must be every possible code point.  Thus the intersection
9901              * is simply 'a'. */
9902
9903             if (*i == a) {  /* No-op */
9904                 return;
9905             }
9906
9907             if (*i == NULL) {
9908                 *i = invlist_clone(a, NULL);
9909                 return;
9910             }
9911
9912             r = invlist_clone(a, NULL);
9913             invlist_replace_list_destroys_src(*i, r);
9914             SvREFCNT_dec_NN(r);
9915             return;
9916         }
9917
9918         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9919          * intersection must be empty */
9920         if (*i == NULL) {
9921             *i = _new_invlist(0);
9922             return;
9923         }
9924
9925         invlist_clear(*i);
9926         return;
9927     }
9928
9929     /* Here both lists exist and are non-empty */
9930     array_a = invlist_array(a);
9931     array_b = invlist_array(b);
9932
9933     /* If are to take the intersection of 'a' with the complement of b, set it
9934      * up so are looking at b's complement. */
9935     if (complement_b) {
9936
9937         /* To complement, we invert: if the first element is 0, remove it.  To
9938          * do this, we just pretend the array starts one later */
9939         if (array_b[0] == 0) {
9940             array_b++;
9941             len_b--;
9942         }
9943         else {
9944
9945             /* But if the first element is not zero, we pretend the list starts
9946              * at the 0 that is always stored immediately before the array. */
9947             array_b--;
9948             len_b++;
9949         }
9950     }
9951
9952     /* Size the intersection for the worst case: that the intersection ends up
9953      * fragmenting everything to be completely disjoint */
9954     r= _new_invlist(len_a + len_b);
9955
9956     /* Will contain U+0000 iff both components do */
9957     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9958                                      && len_b > 0 && array_b[0] == 0);
9959
9960     /* Go through each list item by item, stopping when have exhausted one of
9961      * them */
9962     while (i_a < len_a && i_b < len_b) {
9963         UV cp;      /* The element to potentially add to the intersection's
9964                        array */
9965         bool cp_in_set; /* Is it in the input list's set or not */
9966
9967         /* We need to take one or the other of the two inputs for the
9968          * intersection.  Since we are merging two sorted lists, we take the
9969          * smaller of the next items.  In case of a tie, we take first the one
9970          * that is not in its set (a difference from the union algorithm).  If
9971          * we first took the one in its set, it would increment the count,
9972          * possibly to 2 which would cause it to be output as starting a range
9973          * in the intersection, and the next time through we would take that
9974          * same number, and output it again as ending the set.  By doing the
9975          * opposite of this, there is no possibility that the count will be
9976          * momentarily incremented to 2.  (In a tie and both are in the set or
9977          * both not in the set, it doesn't matter which we take first.) */
9978         if (       array_a[i_a] < array_b[i_b]
9979             || (   array_a[i_a] == array_b[i_b]
9980                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9981         {
9982             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9983             cp = array_a[i_a++];
9984         }
9985         else {
9986             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9987             cp= array_b[i_b++];
9988         }
9989
9990         /* Here, have chosen which of the two inputs to look at.  Only output
9991          * if the running count changes to/from 2, which marks the
9992          * beginning/end of a range that's in the intersection */
9993         if (cp_in_set) {
9994             count++;
9995             if (count == 2) {
9996                 array_r[i_r++] = cp;
9997             }
9998         }
9999         else {
10000             if (count == 2) {
10001                 array_r[i_r++] = cp;
10002             }
10003             count--;
10004         }
10005
10006     }
10007
10008     /* The loop above increments the index into exactly one of the input lists
10009      * each iteration, and ends when either index gets to its list end.  That
10010      * means the other index is lower than its end, and so something is
10011      * remaining in that one.  We increment 'count', as explained below, if the
10012      * exhausted list was in its set.  (i_a and i_b each currently index the
10013      * element beyond the one we care about.) */
10014     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10015         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10016     {
10017         count++;
10018     }
10019
10020     /* Above we incremented 'count' if the exhausted list was in its set.  This
10021      * has made it so that 'count' being below 2 means there is nothing left to
10022      * output; otheriwse what's left to add to the intersection is precisely
10023      * that which is left in the non-exhausted input list.
10024      *
10025      * To see why, note first that the exhausted input obviously has nothing
10026      * left to affect the intersection.  If it was in its set at its end, that
10027      * means the set extends from here to the platform's infinity, and hence
10028      * anything in the non-exhausted's list will be in the intersection, and
10029      * anything not in it won't be.  Hence, the rest of the intersection is
10030      * precisely what's in the non-exhausted list  The exhausted set also
10031      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
10032      * it means 'count' is now at least 2.  This is consistent with the
10033      * incremented 'count' being >= 2 means to add the non-exhausted list to
10034      * the intersection.
10035      *
10036      * But if the exhausted input wasn't in its set, it contributed 0 to
10037      * 'count', and the intersection can't include anything further; the
10038      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
10039      * incremented.  This is consistent with 'count' being < 2 meaning nothing
10040      * further to add to the intersection. */
10041     if (count < 2) { /* Nothing left to put in the intersection. */
10042         len_r = i_r;
10043     }
10044     else { /* copy the non-exhausted list, unchanged. */
10045         IV copy_count = len_a - i_a;
10046         if (copy_count > 0) {   /* a is the one with stuff left */
10047             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10048         }
10049         else {  /* b is the one with stuff left */
10050             copy_count = len_b - i_b;
10051             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10052         }
10053         len_r = i_r + copy_count;
10054     }
10055
10056     /* Set the result to the final length, which can change the pointer to
10057      * array_r, so re-find it.  (Note that it is unlikely that this will
10058      * change, as we are shrinking the space, not enlarging it) */
10059     if (len_r != _invlist_len(r)) {
10060         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10061         invlist_trim(r);
10062         array_r = invlist_array(r);
10063     }
10064
10065     if (*i == NULL) { /* Simply return the calculated intersection */
10066         *i = r;
10067     }
10068     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
10069               instead free '*i', and then set it to 'r', but experience has
10070               shown [perl #127392] that if the input is a mortal, we can get a
10071               huge build-up of these during regex compilation before they get
10072               freed. */
10073         if (len_r) {
10074             invlist_replace_list_destroys_src(*i, r);
10075         }
10076         else {
10077             invlist_clear(*i);
10078         }
10079         SvREFCNT_dec_NN(r);
10080     }
10081
10082     return;
10083 }
10084
10085 SV*
10086 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10087 {
10088     /* Add the range from 'start' to 'end' inclusive to the inversion list's
10089      * set.  A pointer to the inversion list is returned.  This may actually be
10090      * a new list, in which case the passed in one has been destroyed.  The
10091      * passed-in inversion list can be NULL, in which case a new one is created
10092      * with just the one range in it.  The new list is not necessarily
10093      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
10094      * result of this function.  The gain would not be large, and in many
10095      * cases, this is called multiple times on a single inversion list, so
10096      * anything freed may almost immediately be needed again.
10097      *
10098      * This used to mostly call the 'union' routine, but that is much more
10099      * heavyweight than really needed for a single range addition */
10100
10101     UV* array;              /* The array implementing the inversion list */
10102     UV len;                 /* How many elements in 'array' */
10103     SSize_t i_s;            /* index into the invlist array where 'start'
10104                                should go */
10105     SSize_t i_e = 0;        /* And the index where 'end' should go */
10106     UV cur_highest;         /* The highest code point in the inversion list
10107                                upon entry to this function */
10108
10109     /* This range becomes the whole inversion list if none already existed */
10110     if (invlist == NULL) {
10111         invlist = _new_invlist(2);
10112         _append_range_to_invlist(invlist, start, end);
10113         return invlist;
10114     }
10115
10116     /* Likewise, if the inversion list is currently empty */
10117     len = _invlist_len(invlist);
10118     if (len == 0) {
10119         _append_range_to_invlist(invlist, start, end);
10120         return invlist;
10121     }
10122
10123     /* Starting here, we have to know the internals of the list */
10124     array = invlist_array(invlist);
10125
10126     /* If the new range ends higher than the current highest ... */
10127     cur_highest = invlist_highest(invlist);
10128     if (end > cur_highest) {
10129
10130         /* If the whole range is higher, we can just append it */
10131         if (start > cur_highest) {
10132             _append_range_to_invlist(invlist, start, end);
10133             return invlist;
10134         }
10135
10136         /* Otherwise, add the portion that is higher ... */
10137         _append_range_to_invlist(invlist, cur_highest + 1, end);
10138
10139         /* ... and continue on below to handle the rest.  As a result of the
10140          * above append, we know that the index of the end of the range is the
10141          * final even numbered one of the array.  Recall that the final element
10142          * always starts a range that extends to infinity.  If that range is in
10143          * the set (meaning the set goes from here to infinity), it will be an
10144          * even index, but if it isn't in the set, it's odd, and the final
10145          * range in the set is one less, which is even. */
10146         if (end == UV_MAX) {
10147             i_e = len;
10148         }
10149         else {
10150             i_e = len - 2;
10151         }
10152     }
10153
10154     /* We have dealt with appending, now see about prepending.  If the new
10155      * range starts lower than the current lowest ... */
10156     if (start < array[0]) {
10157
10158         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10159          * Let the union code handle it, rather than having to know the
10160          * trickiness in two code places.  */
10161         if (UNLIKELY(start == 0)) {
10162             SV* range_invlist;
10163
10164             range_invlist = _new_invlist(2);
10165             _append_range_to_invlist(range_invlist, start, end);
10166
10167             _invlist_union(invlist, range_invlist, &invlist);
10168
10169             SvREFCNT_dec_NN(range_invlist);
10170
10171             return invlist;
10172         }
10173
10174         /* If the whole new range comes before the first entry, and doesn't
10175          * extend it, we have to insert it as an additional range */
10176         if (end < array[0] - 1) {
10177             i_s = i_e = -1;
10178             goto splice_in_new_range;
10179         }
10180
10181         /* Here the new range adjoins the existing first range, extending it
10182          * downwards. */
10183         array[0] = start;
10184
10185         /* And continue on below to handle the rest.  We know that the index of
10186          * the beginning of the range is the first one of the array */
10187         i_s = 0;
10188     }
10189     else { /* Not prepending any part of the new range to the existing list.
10190             * Find where in the list it should go.  This finds i_s, such that:
10191             *     invlist[i_s] <= start < array[i_s+1]
10192             */
10193         i_s = _invlist_search(invlist, start);
10194     }
10195
10196     /* At this point, any extending before the beginning of the inversion list
10197      * and/or after the end has been done.  This has made it so that, in the
10198      * code below, each endpoint of the new range is either in a range that is
10199      * in the set, or is in a gap between two ranges that are.  This means we
10200      * don't have to worry about exceeding the array bounds.
10201      *
10202      * Find where in the list the new range ends (but we can skip this if we
10203      * have already determined what it is, or if it will be the same as i_s,
10204      * which we already have computed) */
10205     if (i_e == 0) {
10206         i_e = (start == end)
10207               ? i_s
10208               : _invlist_search(invlist, end);
10209     }
10210
10211     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10212      * is a range that goes to infinity there is no element at invlist[i_e+1],
10213      * so only the first relation holds. */
10214
10215     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10216
10217         /* Here, the ranges on either side of the beginning of the new range
10218          * are in the set, and this range starts in the gap between them.
10219          *
10220          * The new range extends the range above it downwards if the new range
10221          * ends at or above that range's start */
10222         const bool extends_the_range_above = (   end == UV_MAX
10223                                               || end + 1 >= array[i_s+1]);
10224
10225         /* The new range extends the range below it upwards if it begins just
10226          * after where that range ends */
10227         if (start == array[i_s]) {
10228
10229             /* If the new range fills the entire gap between the other ranges,
10230              * they will get merged together.  Other ranges may also get
10231              * merged, depending on how many of them the new range spans.  In
10232              * the general case, we do the merge later, just once, after we
10233              * figure out how many to merge.  But in the case where the new
10234              * range exactly spans just this one gap (possibly extending into
10235              * the one above), we do the merge here, and an early exit.  This
10236              * is done here to avoid having to special case later. */
10237             if (i_e - i_s <= 1) {
10238
10239                 /* If i_e - i_s == 1, it means that the new range terminates
10240                  * within the range above, and hence 'extends_the_range_above'
10241                  * must be true.  (If the range above it extends to infinity,
10242                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10243                  * will be 0, so no harm done.) */
10244                 if (extends_the_range_above) {
10245                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10246                     invlist_set_len(invlist,
10247                                     len - 2,
10248                                     *(get_invlist_offset_addr(invlist)));
10249                     return invlist;
10250                 }
10251
10252                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10253                  * to the same range, and below we are about to decrement i_s
10254                  * */
10255                 i_e--;
10256             }
10257
10258             /* Here, the new range is adjacent to the one below.  (It may also
10259              * span beyond the range above, but that will get resolved later.)
10260              * Extend the range below to include this one. */
10261             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10262             i_s--;
10263             start = array[i_s];
10264         }
10265         else if (extends_the_range_above) {
10266
10267             /* Here the new range only extends the range above it, but not the
10268              * one below.  It merges with the one above.  Again, we keep i_e
10269              * and i_s in sync if they point to the same range */
10270             if (i_e == i_s) {
10271                 i_e++;
10272             }
10273             i_s++;
10274             array[i_s] = start;
10275         }
10276     }
10277
10278     /* Here, we've dealt with the new range start extending any adjoining
10279      * existing ranges.
10280      *
10281      * If the new range extends to infinity, it is now the final one,
10282      * regardless of what was there before */
10283     if (UNLIKELY(end == UV_MAX)) {
10284         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10285         return invlist;
10286     }
10287
10288     /* If i_e started as == i_s, it has also been dealt with,
10289      * and been updated to the new i_s, which will fail the following if */
10290     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10291
10292         /* Here, the ranges on either side of the end of the new range are in
10293          * the set, and this range ends in the gap between them.
10294          *
10295          * If this range is adjacent to (hence extends) the range above it, it
10296          * becomes part of that range; likewise if it extends the range below,
10297          * it becomes part of that range */
10298         if (end + 1 == array[i_e+1]) {
10299             i_e++;
10300             array[i_e] = start;
10301         }
10302         else if (start <= array[i_e]) {
10303             array[i_e] = end + 1;
10304             i_e--;
10305         }
10306     }
10307
10308     if (i_s == i_e) {
10309
10310         /* If the range fits entirely in an existing range (as possibly already
10311          * extended above), it doesn't add anything new */
10312         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10313             return invlist;
10314         }
10315
10316         /* Here, no part of the range is in the list.  Must add it.  It will
10317          * occupy 2 more slots */
10318       splice_in_new_range:
10319
10320         invlist_extend(invlist, len + 2);
10321         array = invlist_array(invlist);
10322         /* Move the rest of the array down two slots. Don't include any
10323          * trailing NUL */
10324         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10325
10326         /* Do the actual splice */
10327         array[i_e+1] = start;
10328         array[i_e+2] = end + 1;
10329         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10330         return invlist;
10331     }
10332
10333     /* Here the new range crossed the boundaries of a pre-existing range.  The
10334      * code above has adjusted things so that both ends are in ranges that are
10335      * in the set.  This means everything in between must also be in the set.
10336      * Just squash things together */
10337     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10338     invlist_set_len(invlist,
10339                     len - i_e + i_s,
10340                     *(get_invlist_offset_addr(invlist)));
10341
10342     return invlist;
10343 }
10344
10345 SV*
10346 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10347                                  UV** other_elements_ptr)
10348 {
10349     /* Create and return an inversion list whose contents are to be populated
10350      * by the caller.  The caller gives the number of elements (in 'size') and
10351      * the very first element ('element0').  This function will set
10352      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10353      * are to be placed.
10354      *
10355      * Obviously there is some trust involved that the caller will properly
10356      * fill in the other elements of the array.
10357      *
10358      * (The first element needs to be passed in, as the underlying code does
10359      * things differently depending on whether it is zero or non-zero) */
10360
10361     SV* invlist = _new_invlist(size);
10362     bool offset;
10363
10364     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10365
10366     invlist = add_cp_to_invlist(invlist, element0);
10367     offset = *get_invlist_offset_addr(invlist);
10368
10369     invlist_set_len(invlist, size, offset);
10370     *other_elements_ptr = invlist_array(invlist) + 1;
10371     return invlist;
10372 }
10373
10374 #endif
10375
10376 #ifndef PERL_IN_XSUB_RE
10377 void
10378 Perl__invlist_invert(pTHX_ SV* const invlist)
10379 {
10380     /* Complement the input inversion list.  This adds a 0 if the list didn't
10381      * have a zero; removes it otherwise.  As described above, the data
10382      * structure is set up so that this is very efficient */
10383
10384     PERL_ARGS_ASSERT__INVLIST_INVERT;
10385
10386     assert(! invlist_is_iterating(invlist));
10387
10388     /* The inverse of matching nothing is matching everything */
10389     if (_invlist_len(invlist) == 0) {
10390         _append_range_to_invlist(invlist, 0, UV_MAX);
10391         return;
10392     }
10393
10394     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10395 }
10396
10397 SV*
10398 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10399 {
10400     /* Return a new inversion list that is a copy of the input one, which is
10401      * unchanged.  The new list will not be mortal even if the old one was. */
10402
10403     const STRLEN nominal_length = _invlist_len(invlist);
10404     const STRLEN physical_length = SvCUR(invlist);
10405     const bool offset = *(get_invlist_offset_addr(invlist));
10406
10407     PERL_ARGS_ASSERT_INVLIST_CLONE;
10408
10409     if (new_invlist == NULL) {
10410         new_invlist = _new_invlist(nominal_length);
10411     }
10412     else {
10413         sv_upgrade(new_invlist, SVt_INVLIST);
10414         initialize_invlist_guts(new_invlist, nominal_length);
10415     }
10416
10417     *(get_invlist_offset_addr(new_invlist)) = offset;
10418     invlist_set_len(new_invlist, nominal_length, offset);
10419     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10420
10421     return new_invlist;
10422 }
10423
10424 #endif
10425
10426 PERL_STATIC_INLINE UV
10427 S_invlist_lowest(SV* const invlist)
10428 {
10429     /* Returns the lowest code point that matches an inversion list.  This API
10430      * has an ambiguity, as it returns 0 under either the lowest is actually
10431      * 0, or if the list is empty.  If this distinction matters to you, check
10432      * for emptiness before calling this function */
10433
10434     UV len = _invlist_len(invlist);
10435     UV *array;
10436
10437     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10438
10439     if (len == 0) {
10440         return 0;
10441     }
10442
10443     array = invlist_array(invlist);
10444
10445     return array[0];
10446 }
10447
10448 STATIC SV *
10449 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10450 {
10451     /* Get the contents of an inversion list into a string SV so that they can
10452      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10453      * traditionally done for debug tracing; otherwise it uses a format
10454      * suitable for just copying to the output, with blanks between ranges and
10455      * a dash between range components */
10456
10457     UV start, end;
10458     SV* output;
10459     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10460     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10461
10462     if (traditional_style) {
10463         output = newSVpvs("\n");
10464     }
10465     else {
10466         output = newSVpvs("");
10467     }
10468
10469     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10470
10471     assert(! invlist_is_iterating(invlist));
10472
10473     invlist_iterinit(invlist);
10474     while (invlist_iternext(invlist, &start, &end)) {
10475         if (end == UV_MAX) {
10476             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10477                                           start, intra_range_delimiter,
10478                                                  inter_range_delimiter);
10479         }
10480         else if (end != start) {
10481             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10482                                           start,
10483                                                    intra_range_delimiter,
10484                                                   end, inter_range_delimiter);
10485         }
10486         else {
10487             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10488                                           start, inter_range_delimiter);
10489         }
10490     }
10491
10492     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10493         SvCUR_set(output, SvCUR(output) - 1);
10494     }
10495
10496     return output;
10497 }
10498
10499 #ifndef PERL_IN_XSUB_RE
10500 void
10501 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10502                          const char * const indent, SV* const invlist)
10503 {
10504     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10505      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10506      * the string 'indent'.  The output looks like this:
10507          [0] 0x000A .. 0x000D
10508          [2] 0x0085
10509          [4] 0x2028 .. 0x2029
10510          [6] 0x3104 .. INFTY
10511      * This means that the first range of code points matched by the list are
10512      * 0xA through 0xD; the second range contains only the single code point
10513      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10514      * are used to define each range (except if the final range extends to
10515      * infinity, only a single element is needed).  The array index of the
10516      * first element for the corresponding range is given in brackets. */
10517
10518     UV start, end;
10519     STRLEN count = 0;
10520
10521     PERL_ARGS_ASSERT__INVLIST_DUMP;
10522
10523     if (invlist_is_iterating(invlist)) {
10524         Perl_dump_indent(aTHX_ level, file,
10525              "%sCan't dump inversion list because is in middle of iterating\n",
10526              indent);
10527         return;
10528     }
10529
10530     invlist_iterinit(invlist);
10531     while (invlist_iternext(invlist, &start, &end)) {
10532         if (end == UV_MAX) {
10533             Perl_dump_indent(aTHX_ level, file,
10534                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10535                                    indent, (UV)count, start);
10536         }
10537         else if (end != start) {
10538             Perl_dump_indent(aTHX_ level, file,
10539                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10540                                 indent, (UV)count, start,         end);
10541         }
10542         else {
10543             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10544                                             indent, (UV)count, start);
10545         }
10546         count += 2;
10547     }
10548 }
10549
10550 #endif
10551
10552 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10553 bool
10554 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10555 {
10556     /* Return a boolean as to if the two passed in inversion lists are
10557      * identical.  The final argument, if TRUE, says to take the complement of
10558      * the second inversion list before doing the comparison */
10559
10560     const UV len_a = _invlist_len(a);
10561     UV len_b = _invlist_len(b);
10562
10563     const UV* array_a = NULL;
10564     const UV* array_b = NULL;
10565
10566     PERL_ARGS_ASSERT__INVLISTEQ;
10567
10568     /* This code avoids accessing the arrays unless it knows the length is
10569      * non-zero */
10570
10571     if (len_a == 0) {
10572         if (len_b == 0) {
10573             return ! complement_b;
10574         }
10575     }
10576     else {
10577         array_a = invlist_array(a);
10578     }
10579
10580     if (len_b != 0) {
10581         array_b = invlist_array(b);
10582     }
10583
10584     /* If are to compare 'a' with the complement of b, set it
10585      * up so are looking at b's complement. */
10586     if (complement_b) {
10587
10588         /* The complement of nothing is everything, so <a> would have to have
10589          * just one element, starting at zero (ending at infinity) */
10590         if (len_b == 0) {
10591             return (len_a == 1 && array_a[0] == 0);
10592         }
10593         if (array_b[0] == 0) {
10594
10595             /* Otherwise, to complement, we invert.  Here, the first element is
10596              * 0, just remove it.  To do this, we just pretend the array starts
10597              * one later */
10598
10599             array_b++;
10600             len_b--;
10601         }
10602         else {
10603
10604             /* But if the first element is not zero, we pretend the list starts
10605              * at the 0 that is always stored immediately before the array. */
10606             array_b--;
10607             len_b++;
10608         }
10609     }
10610
10611     return    len_a == len_b
10612            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10613
10614 }
10615 #endif
10616
10617 /*
10618  * As best we can, determine the characters that can match the start of
10619  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10620  * can be false positive matches
10621  *
10622  * Returns the invlist as a new SV*; it is the caller's responsibility to
10623  * call SvREFCNT_dec() when done with it.
10624  */
10625 STATIC SV*
10626 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10627 {
10628     const U8 * s = (U8*)STRING(node);
10629     SSize_t bytelen = STR_LEN(node);
10630     UV uc;
10631     /* Start out big enough for 2 separate code points */
10632     SV* invlist = _new_invlist(4);
10633
10634     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10635
10636     if (! UTF) {
10637         uc = *s;
10638
10639         /* We punt and assume can match anything if the node begins
10640          * with a multi-character fold.  Things are complicated.  For
10641          * example, /ffi/i could match any of:
10642          *  "\N{LATIN SMALL LIGATURE FFI}"
10643          *  "\N{LATIN SMALL LIGATURE FF}I"
10644          *  "F\N{LATIN SMALL LIGATURE FI}"
10645          *  plus several other things; and making sure we have all the
10646          *  possibilities is hard. */
10647         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10648             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10649         }
10650         else {
10651             /* Any Latin1 range character can potentially match any
10652              * other depending on the locale, and in Turkic locales, U+130 and
10653              * U+131 */
10654             if (OP(node) == EXACTFL) {
10655                 _invlist_union(invlist, PL_Latin1, &invlist);
10656                 invlist = add_cp_to_invlist(invlist,
10657                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10658                 invlist = add_cp_to_invlist(invlist,
10659                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10660             }
10661             else {
10662                 /* But otherwise, it matches at least itself.  We can
10663                  * quickly tell if it has a distinct fold, and if so,
10664                  * it matches that as well */
10665                 invlist = add_cp_to_invlist(invlist, uc);
10666                 if (IS_IN_SOME_FOLD_L1(uc))
10667                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10668             }
10669
10670             /* Some characters match above-Latin1 ones under /i.  This
10671              * is true of EXACTFL ones when the locale is UTF-8 */
10672             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10673                 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
10674                                                          EXACTFAA_NO_TRIE)))
10675             {
10676                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10677             }
10678         }
10679     }
10680     else {  /* Pattern is UTF-8 */
10681         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10682         const U8* e = s + bytelen;
10683         IV fc;
10684
10685         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10686
10687         /* The only code points that aren't folded in a UTF EXACTFish
10688          * node are the problematic ones in EXACTFL nodes */
10689         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10690             /* We need to check for the possibility that this EXACTFL
10691              * node begins with a multi-char fold.  Therefore we fold
10692              * the first few characters of it so that we can make that
10693              * check */
10694             U8 *d = folded;
10695             int i;
10696
10697             fc = -1;
10698             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10699                 if (isASCII(*s)) {
10700                     *(d++) = (U8) toFOLD(*s);
10701                     if (fc < 0) {       /* Save the first fold */
10702                         fc = *(d-1);
10703                     }
10704                     s++;
10705                 }
10706                 else {
10707                     STRLEN len;
10708                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10709                     if (fc < 0) {       /* Save the first fold */
10710                         fc = fold;
10711                     }
10712                     d += len;
10713                     s += UTF8SKIP(s);
10714                 }
10715             }
10716
10717             /* And set up so the code below that looks in this folded
10718              * buffer instead of the node's string */
10719             e = d;
10720             s = folded;
10721         }
10722
10723         /* When we reach here 's' points to the fold of the first
10724          * character(s) of the node; and 'e' points to far enough along
10725          * the folded string to be just past any possible multi-char
10726          * fold.
10727          *
10728          * Like the non-UTF case above, we punt if the node begins with a
10729          * multi-char fold  */
10730
10731         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10732             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10733         }
10734         else {  /* Single char fold */
10735             unsigned int k;
10736             U32 first_fold;
10737             const U32 * remaining_folds;
10738             Size_t folds_count;
10739
10740             /* It matches itself */
10741             invlist = add_cp_to_invlist(invlist, fc);
10742
10743             /* ... plus all the things that fold to it, which are found in
10744              * PL_utf8_foldclosures */
10745             folds_count = _inverse_folds(fc, &first_fold,
10746                                                 &remaining_folds);
10747             for (k = 0; k < folds_count; k++) {
10748                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10749
10750                 /* /aa doesn't allow folds between ASCII and non- */
10751                 if (   inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
10752                     && isASCII(c) != isASCII(fc))
10753                 {
10754                     continue;
10755                 }
10756
10757                 invlist = add_cp_to_invlist(invlist, c);
10758             }
10759
10760             if (OP(node) == EXACTFL) {
10761
10762                 /* If either [iI] are present in an EXACTFL node the above code
10763                  * should have added its normal case pair, but under a Turkish
10764                  * locale they could match instead the case pairs from it.  Add
10765                  * those as potential matches as well */
10766                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10767                     invlist = add_cp_to_invlist(invlist,
10768                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10769                     invlist = add_cp_to_invlist(invlist,
10770                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10771                 }
10772                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10773                     invlist = add_cp_to_invlist(invlist, 'I');
10774                 }
10775                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10776                     invlist = add_cp_to_invlist(invlist, 'i');
10777                 }
10778             }
10779         }
10780     }
10781
10782     return invlist;
10783 }
10784
10785 #undef HEADER_LENGTH
10786 #undef TO_INTERNAL_SIZE
10787 #undef FROM_INTERNAL_SIZE
10788 #undef INVLIST_VERSION_ID
10789
10790 /* End of inversion list object */
10791
10792 STATIC void
10793 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10794 {
10795     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10796      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10797      * should point to the first flag; it is updated on output to point to the
10798      * final ')' or ':'.  There needs to be at least one flag, or this will
10799      * abort */
10800
10801     /* for (?g), (?gc), and (?o) warnings; warning
10802        about (?c) will warn about (?g) -- japhy    */
10803
10804 #define WASTED_O  0x01
10805 #define WASTED_G  0x02
10806 #define WASTED_C  0x04
10807 #define WASTED_GC (WASTED_G|WASTED_C)
10808     I32 wastedflags = 0x00;
10809     U32 posflags = 0, negflags = 0;
10810     U32 *flagsp = &posflags;
10811     char has_charset_modifier = '\0';
10812     regex_charset cs;
10813     bool has_use_defaults = FALSE;
10814     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10815     int x_mod_count = 0;
10816
10817     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10818
10819     /* '^' as an initial flag sets certain defaults */
10820     if (UCHARAT(RExC_parse) == '^') {
10821         RExC_parse++;
10822         has_use_defaults = TRUE;
10823         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10824         cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10825              ? REGEX_UNICODE_CHARSET
10826              : REGEX_DEPENDS_CHARSET;
10827         set_regex_charset(&RExC_flags, cs);
10828     }
10829     else {
10830         cs = get_regex_charset(RExC_flags);
10831         if (   cs == REGEX_DEPENDS_CHARSET
10832             && (toUSE_UNI_CHARSET_NOT_DEPENDS))
10833         {
10834             cs = REGEX_UNICODE_CHARSET;
10835         }
10836     }
10837
10838     while (RExC_parse < RExC_end) {
10839         /* && memCHRs("iogcmsx", *RExC_parse) */
10840         /* (?g), (?gc) and (?o) are useless here
10841            and must be globally applied -- japhy */
10842         if ((RExC_pm_flags & PMf_WILDCARD)) {
10843             if (flagsp == & negflags) {
10844                 if (*RExC_parse == 'm') {
10845                     RExC_parse++;
10846                     /* diag_listed_as: Use of %s is not allowed in Unicode
10847                        property wildcard subpatterns in regex; marked by <--
10848                        HERE in m/%s/ */
10849                     vFAIL("Use of modifier '-m' is not allowed in Unicode"
10850                           " property wildcard subpatterns");
10851                 }
10852             }
10853             else {
10854                 if (*RExC_parse == 's') {
10855                     goto modifier_illegal_in_wildcard;
10856                 }
10857             }
10858         }
10859
10860         switch (*RExC_parse) {
10861
10862             /* Code for the imsxn flags */
10863             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10864
10865             case LOCALE_PAT_MOD:
10866                 if (has_charset_modifier) {
10867                     goto excess_modifier;
10868                 }
10869                 else if (flagsp == &negflags) {
10870                     goto neg_modifier;
10871                 }
10872                 cs = REGEX_LOCALE_CHARSET;
10873                 has_charset_modifier = LOCALE_PAT_MOD;
10874                 break;
10875             case UNICODE_PAT_MOD:
10876                 if (has_charset_modifier) {
10877                     goto excess_modifier;
10878                 }
10879                 else if (flagsp == &negflags) {
10880                     goto neg_modifier;
10881                 }
10882                 cs = REGEX_UNICODE_CHARSET;
10883                 has_charset_modifier = UNICODE_PAT_MOD;
10884                 break;
10885             case ASCII_RESTRICT_PAT_MOD:
10886                 if (flagsp == &negflags) {
10887                     goto neg_modifier;
10888                 }
10889                 if (has_charset_modifier) {
10890                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10891                         goto excess_modifier;
10892                     }
10893                     /* Doubled modifier implies more restricted */
10894                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10895                 }
10896                 else {
10897                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10898                 }
10899                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10900                 break;
10901             case DEPENDS_PAT_MOD:
10902                 if (has_use_defaults) {
10903                     goto fail_modifiers;
10904                 }
10905                 else if (flagsp == &negflags) {
10906                     goto neg_modifier;
10907                 }
10908                 else if (has_charset_modifier) {
10909                     goto excess_modifier;
10910                 }
10911
10912                 /* The dual charset means unicode semantics if the
10913                  * pattern (or target, not known until runtime) are
10914                  * utf8, or something in the pattern indicates unicode
10915                  * semantics */
10916                 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10917                      ? REGEX_UNICODE_CHARSET
10918                      : REGEX_DEPENDS_CHARSET;
10919                 has_charset_modifier = DEPENDS_PAT_MOD;
10920                 break;
10921               excess_modifier:
10922                 RExC_parse++;
10923                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10924                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10925                 }
10926                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10927                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10928                                         *(RExC_parse - 1));
10929                 }
10930                 else {
10931                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10932                 }
10933                 NOT_REACHED; /*NOTREACHED*/
10934               neg_modifier:
10935                 RExC_parse++;
10936                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10937                                     *(RExC_parse - 1));
10938                 NOT_REACHED; /*NOTREACHED*/
10939             case GLOBAL_PAT_MOD: /* 'g' */
10940                 if (RExC_pm_flags & PMf_WILDCARD) {
10941                     goto modifier_illegal_in_wildcard;
10942                 }
10943                 /*FALLTHROUGH*/
10944             case ONCE_PAT_MOD: /* 'o' */
10945                 if (ckWARN(WARN_REGEXP)) {
10946                     const I32 wflagbit = *RExC_parse == 'o'
10947                                          ? WASTED_O
10948                                          : WASTED_G;
10949                     if (! (wastedflags & wflagbit) ) {
10950                         wastedflags |= wflagbit;
10951                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10952                         vWARN5(
10953                             RExC_parse + 1,
10954                             "Useless (%s%c) - %suse /%c modifier",
10955                             flagsp == &negflags ? "?-" : "?",
10956                             *RExC_parse,
10957                             flagsp == &negflags ? "don't " : "",
10958                             *RExC_parse
10959                         );
10960                     }
10961                 }
10962                 break;
10963
10964             case CONTINUE_PAT_MOD: /* 'c' */
10965                 if (RExC_pm_flags & PMf_WILDCARD) {
10966                     goto modifier_illegal_in_wildcard;
10967                 }
10968                 if (ckWARN(WARN_REGEXP)) {
10969                     if (! (wastedflags & WASTED_C) ) {
10970                         wastedflags |= WASTED_GC;
10971                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10972                         vWARN3(
10973                             RExC_parse + 1,
10974                             "Useless (%sc) - %suse /gc modifier",
10975                             flagsp == &negflags ? "?-" : "?",
10976                             flagsp == &negflags ? "don't " : ""
10977                         );
10978                     }
10979                 }
10980                 break;
10981             case KEEPCOPY_PAT_MOD: /* 'p' */
10982                 if (RExC_pm_flags & PMf_WILDCARD) {
10983                     goto modifier_illegal_in_wildcard;
10984                 }
10985                 if (flagsp == &negflags) {
10986                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10987                 } else {
10988                     *flagsp |= RXf_PMf_KEEPCOPY;
10989                 }
10990                 break;
10991             case '-':
10992                 /* A flag is a default iff it is following a minus, so
10993                  * if there is a minus, it means will be trying to
10994                  * re-specify a default which is an error */
10995                 if (has_use_defaults || flagsp == &negflags) {
10996                     goto fail_modifiers;
10997                 }
10998                 flagsp = &negflags;
10999                 wastedflags = 0;  /* reset so (?g-c) warns twice */
11000                 x_mod_count = 0;
11001                 break;
11002             case ':':
11003             case ')':
11004
11005                 if (  (RExC_pm_flags & PMf_WILDCARD)
11006                     && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11007                 {
11008                     RExC_parse++;
11009                     /* diag_listed_as: Use of %s is not allowed in Unicode
11010                        property wildcard subpatterns in regex; marked by <--
11011                        HERE in m/%s/ */
11012                     vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11013                            " property wildcard subpatterns",
11014                            has_charset_modifier);
11015                 }
11016
11017                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11018                     negflags |= RXf_PMf_EXTENDED_MORE;
11019                 }
11020                 RExC_flags |= posflags;
11021
11022                 if (negflags & RXf_PMf_EXTENDED) {
11023                     negflags |= RXf_PMf_EXTENDED_MORE;
11024                 }
11025                 RExC_flags &= ~negflags;
11026                 set_regex_charset(&RExC_flags, cs);
11027
11028                 return;
11029             default:
11030               fail_modifiers:
11031                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11032                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11033                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11034                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11035                 NOT_REACHED; /*NOTREACHED*/
11036         }
11037
11038         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11039     }
11040
11041     vFAIL("Sequence (?... not terminated");
11042
11043   modifier_illegal_in_wildcard:
11044     RExC_parse++;
11045     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11046        subpatterns in regex; marked by <-- HERE in m/%s/ */
11047     vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11048            " subpatterns", *(RExC_parse - 1));
11049 }
11050
11051 /*
11052  - reg - regular expression, i.e. main body or parenthesized thing
11053  *
11054  * Caller must absorb opening parenthesis.
11055  *
11056  * Combining parenthesis handling with the base level of regular expression
11057  * is a trifle forced, but the need to tie the tails of the branches to what
11058  * follows makes it hard to avoid.
11059  */
11060 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11061 #ifdef DEBUGGING
11062 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11063 #else
11064 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11065 #endif
11066
11067 STATIC regnode_offset
11068 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11069                              I32 *flagp,
11070                              char * parse_start,
11071                              char ch
11072                       )
11073 {
11074     regnode_offset ret;
11075     char* name_start = RExC_parse;
11076     U32 num = 0;
11077     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11078     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11079
11080     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11081
11082     if (RExC_parse == name_start || *RExC_parse != ch) {
11083         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11084         vFAIL2("Sequence %.3s... not terminated", parse_start);
11085     }
11086
11087     if (sv_dat) {
11088         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11089         RExC_rxi->data->data[num]=(void*)sv_dat;
11090         SvREFCNT_inc_simple_void_NN(sv_dat);
11091     }
11092     RExC_sawback = 1;
11093     ret = reganode(pRExC_state,
11094                    ((! FOLD)
11095                      ? REFN
11096                      : (ASCII_FOLD_RESTRICTED)
11097                        ? REFFAN
11098                        : (AT_LEAST_UNI_SEMANTICS)
11099                          ? REFFUN
11100                          : (LOC)
11101                            ? REFFLN
11102                            : REFFN),
11103                     num);
11104     *flagp |= HASWIDTH;
11105
11106     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11107     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11108
11109     nextchar(pRExC_state);
11110     return ret;
11111 }
11112
11113 /* On success, returns the offset at which any next node should be placed into
11114  * the regex engine program being compiled.
11115  *
11116  * Returns 0 otherwise, with *flagp set to indicate why:
11117  *  TRYAGAIN        at the end of (?) that only sets flags.
11118  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11119  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11120  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11121  *  happen.  */
11122 STATIC regnode_offset
11123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11124     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11125      * 2 is like 1, but indicates that nextchar() has been called to advance
11126      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11127      * this flag alerts us to the need to check for that */
11128 {
11129     regnode_offset ret = 0;    /* Will be the head of the group. */
11130     regnode_offset br;
11131     regnode_offset lastbr;
11132     regnode_offset ender = 0;
11133     I32 parno = 0;
11134     I32 flags;
11135     U32 oregflags = RExC_flags;
11136     bool have_branch = 0;
11137     bool is_open = 0;
11138     I32 freeze_paren = 0;
11139     I32 after_freeze = 0;
11140     I32 num; /* numeric backreferences */
11141     SV * max_open;  /* Max number of unclosed parens */
11142     I32 was_in_lookaround = RExC_in_lookaround;
11143
11144     char * parse_start = RExC_parse; /* MJD */
11145     char * const oregcomp_parse = RExC_parse;
11146
11147     DECLARE_AND_GET_RE_DEBUG_FLAGS;
11148
11149     PERL_ARGS_ASSERT_REG;
11150     DEBUG_PARSE("reg ");
11151
11152     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11153     assert(max_open);
11154     if (!SvIOK(max_open)) {
11155         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11156     }
11157     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11158                                               open paren */
11159         vFAIL("Too many nested open parens");
11160     }
11161
11162     *flagp = 0;                         /* Initialize. */
11163
11164     /* Having this true makes it feasible to have a lot fewer tests for the
11165      * parse pointer being in scope.  For example, we can write
11166      *      while(isFOO(*RExC_parse)) RExC_parse++;
11167      * instead of
11168      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11169      */
11170     assert(*RExC_end == '\0');
11171
11172     /* Make an OPEN node, if parenthesized. */
11173     if (paren) {
11174
11175         /* Under /x, space and comments can be gobbled up between the '(' and
11176          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11177          * intervening space, as the sequence is a token, and a token should be
11178          * indivisible */
11179         bool has_intervening_patws = (paren == 2)
11180                                   && *(RExC_parse - 1) != '(';
11181
11182         if (RExC_parse >= RExC_end) {
11183             vFAIL("Unmatched (");
11184         }
11185
11186         if (paren == 'r') {     /* Atomic script run */
11187             paren = '>';
11188             goto parse_rest;
11189         }
11190         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11191             char *start_verb = RExC_parse + 1;
11192             STRLEN verb_len;
11193             char *start_arg = NULL;
11194             unsigned char op = 0;
11195             int arg_required = 0;
11196             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11197             bool has_upper = FALSE;
11198
11199             if (has_intervening_patws) {
11200                 RExC_parse++;   /* past the '*' */
11201
11202                 /* For strict backwards compatibility, don't change the message
11203                  * now that we also have lowercase operands */
11204                 if (isUPPER(*RExC_parse)) {
11205                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11206                 }
11207                 else {
11208                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11209                 }
11210             }
11211             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11212                 if ( *RExC_parse == ':' ) {
11213                     start_arg = RExC_parse + 1;
11214                     break;
11215                 }
11216                 else if (! UTF) {
11217                     if (isUPPER(*RExC_parse)) {
11218                         has_upper = TRUE;
11219                     }
11220                     RExC_parse++;
11221                 }
11222                 else {
11223                     RExC_parse += UTF8SKIP(RExC_parse);
11224                 }
11225             }
11226             verb_len = RExC_parse - start_verb;
11227             if ( start_arg ) {
11228                 if (RExC_parse >= RExC_end) {
11229                     goto unterminated_verb_pattern;
11230                 }
11231
11232                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11233                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11234                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11235                 }
11236                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11237                   unterminated_verb_pattern:
11238                     if (has_upper) {
11239                         vFAIL("Unterminated verb pattern argument");
11240                     }
11241                     else {
11242                         vFAIL("Unterminated '(*...' argument");
11243                     }
11244                 }
11245             } else {
11246                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11247                     if (has_upper) {
11248                         vFAIL("Unterminated verb pattern");
11249                     }
11250                     else {
11251                         vFAIL("Unterminated '(*...' construct");
11252                     }
11253                 }
11254             }
11255
11256             /* Here, we know that RExC_parse < RExC_end */
11257
11258             switch ( *start_verb ) {
11259             case 'A':  /* (*ACCEPT) */
11260                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11261                     op = ACCEPT;
11262                     internal_argval = RExC_nestroot;
11263                 }
11264                 break;
11265             case 'C':  /* (*COMMIT) */
11266                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11267                     op = COMMIT;
11268                 break;
11269             case 'F':  /* (*FAIL) */
11270                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11271                     op = OPFAIL;
11272                 }
11273                 break;
11274             case ':':  /* (*:NAME) */
11275             case 'M':  /* (*MARK:NAME) */
11276                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11277                     op = MARKPOINT;
11278                     arg_required = 1;
11279                 }
11280                 break;
11281             case 'P':  /* (*PRUNE) */
11282                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11283                     op = PRUNE;
11284                 break;
11285             case 'S':   /* (*SKIP) */
11286                 if ( memEQs(start_verb, verb_len,"SKIP") )
11287                     op = SKIP;
11288                 break;
11289             case 'T':  /* (*THEN) */
11290                 /* [19:06] <TimToady> :: is then */
11291                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11292                     op = CUTGROUP;
11293                     RExC_seen |= REG_CUTGROUP_SEEN;
11294                 }
11295                 break;
11296             case 'a':
11297                 if (   memEQs(start_verb, verb_len, "asr")
11298                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11299                 {
11300                     paren = 'r';        /* Mnemonic: recursed run */
11301                     goto script_run;
11302                 }
11303                 else if (memEQs(start_verb, verb_len, "atomic")) {
11304                     paren = 't';    /* AtOMIC */
11305                     goto alpha_assertions;
11306                 }
11307                 break;
11308             case 'p':
11309                 if (   memEQs(start_verb, verb_len, "plb")
11310                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11311                 {
11312                     paren = 'b';
11313                     goto lookbehind_alpha_assertions;
11314                 }
11315                 else if (   memEQs(start_verb, verb_len, "pla")
11316                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11317                 {
11318                     paren = 'a';
11319                     goto alpha_assertions;
11320                 }
11321                 break;
11322             case 'n':
11323                 if (   memEQs(start_verb, verb_len, "nlb")
11324                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11325                 {
11326                     paren = 'B';
11327                     goto lookbehind_alpha_assertions;
11328                 }
11329                 else if (   memEQs(start_verb, verb_len, "nla")
11330                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11331                 {
11332                     paren = 'A';
11333                     goto alpha_assertions;
11334                 }
11335                 break;
11336             case 's':
11337                 if (   memEQs(start_verb, verb_len, "sr")
11338                     || memEQs(start_verb, verb_len, "script_run"))
11339                 {
11340                     regnode_offset atomic;
11341
11342                     paren = 's';
11343
11344                    script_run:
11345
11346                     /* This indicates Unicode rules. */
11347                     REQUIRE_UNI_RULES(flagp, 0);
11348
11349                     if (! start_arg) {
11350                         goto no_colon;
11351                     }
11352
11353                     RExC_parse = start_arg;
11354
11355                     if (RExC_in_script_run) {
11356
11357                         /*  Nested script runs are treated as no-ops, because
11358                          *  if the nested one fails, the outer one must as
11359                          *  well.  It could fail sooner, and avoid (??{} with
11360                          *  side effects, but that is explicitly documented as
11361                          *  undefined behavior. */
11362
11363                         ret = 0;
11364
11365                         if (paren == 's') {
11366                             paren = ':';
11367                             goto parse_rest;
11368                         }
11369
11370                         /* But, the atomic part of a nested atomic script run
11371                          * isn't a no-op, but can be treated just like a '(?>'
11372                          * */
11373                         paren = '>';
11374                         goto parse_rest;
11375                     }
11376
11377                     if (paren == 's') {
11378                         /* Here, we're starting a new regular script run */
11379                         ret = reg_node(pRExC_state, SROPEN);
11380                         RExC_in_script_run = 1;
11381                         is_open = 1;
11382                         goto parse_rest;
11383                     }
11384
11385                     /* Here, we are starting an atomic script run.  This is
11386                      * handled by recursing to deal with the atomic portion
11387                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11388
11389                     ret = reg_node(pRExC_state, SROPEN);
11390
11391                     RExC_in_script_run = 1;
11392
11393                     atomic = reg(pRExC_state, 'r', &flags, depth);
11394                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11395                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11396                         return 0;
11397                     }
11398
11399                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11400                         REQUIRE_BRANCHJ(flagp, 0);
11401                     }
11402
11403                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11404                                                                 SRCLOSE)))
11405                     {
11406                         REQUIRE_BRANCHJ(flagp, 0);
11407                     }
11408
11409                     RExC_in_script_run = 0;
11410                     return ret;
11411                 }
11412
11413                 break;
11414
11415             lookbehind_alpha_assertions:
11416                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11417                 /*FALLTHROUGH*/
11418
11419             alpha_assertions:
11420
11421                 RExC_in_lookaround++;
11422                 RExC_seen_zerolen++;
11423
11424                 if (! start_arg) {
11425                     goto no_colon;
11426                 }
11427
11428                 /* An empty negative lookahead assertion simply is failure */
11429                 if (paren == 'A' && RExC_parse == start_arg) {
11430                     ret=reganode(pRExC_state, OPFAIL, 0);
11431                     nextchar(pRExC_state);
11432                     return ret;
11433                 }
11434
11435                 RExC_parse = start_arg;
11436                 goto parse_rest;
11437
11438               no_colon:
11439                 vFAIL2utf8f(
11440                 "'(*%" UTF8f "' requires a terminating ':'",
11441                 UTF8fARG(UTF, verb_len, start_verb));
11442                 NOT_REACHED; /*NOTREACHED*/
11443
11444             } /* End of switch */
11445             if ( ! op ) {
11446                 RExC_parse += UTF
11447                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11448                               : 1;
11449                 if (has_upper || verb_len == 0) {
11450                     vFAIL2utf8f(
11451                     "Unknown verb pattern '%" UTF8f "'",
11452                     UTF8fARG(UTF, verb_len, start_verb));
11453                 }
11454                 else {
11455                     vFAIL2utf8f(
11456                     "Unknown '(*...)' construct '%" UTF8f "'",
11457                     UTF8fARG(UTF, verb_len, start_verb));
11458                 }
11459             }
11460             if ( RExC_parse == start_arg ) {
11461                 start_arg = NULL;
11462             }
11463             if ( arg_required && !start_arg ) {
11464                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11465                     (int) verb_len, start_verb);
11466             }
11467             if (internal_argval == -1) {
11468                 ret = reganode(pRExC_state, op, 0);
11469             } else {
11470                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11471             }
11472             RExC_seen |= REG_VERBARG_SEEN;
11473             if (start_arg) {
11474                 SV *sv = newSVpvn( start_arg,
11475                                     RExC_parse - start_arg);
11476                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11477                                         STR_WITH_LEN("S"));
11478                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11479                 FLAGS(REGNODE_p(ret)) = 1;
11480             } else {
11481                 FLAGS(REGNODE_p(ret)) = 0;
11482             }
11483             if ( internal_argval != -1 )
11484                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11485             nextchar(pRExC_state);
11486             return ret;
11487         }
11488         else if (*RExC_parse == '?') { /* (?...) */
11489             bool is_logical = 0;
11490             const char * const seqstart = RExC_parse;
11491             const char * endptr;
11492             const char non_existent_group_msg[]
11493                                             = "Reference to nonexistent group";
11494             const char impossible_group[] = "Invalid reference to group";
11495
11496             if (has_intervening_patws) {
11497                 RExC_parse++;
11498                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11499             }
11500
11501             RExC_parse++;           /* past the '?' */
11502             paren = *RExC_parse;    /* might be a trailing NUL, if not
11503                                        well-formed */
11504             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11505             if (RExC_parse > RExC_end) {
11506                 paren = '\0';
11507             }
11508             ret = 0;                    /* For look-ahead/behind. */
11509             switch (paren) {
11510
11511             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11512                 paren = *RExC_parse;
11513                 if ( paren == '<') {    /* (?P<...>) named capture */
11514                     RExC_parse++;
11515                     if (RExC_parse >= RExC_end) {
11516                         vFAIL("Sequence (?P<... not terminated");
11517                     }
11518                     goto named_capture;
11519                 }
11520                 else if (paren == '>') {   /* (?P>name) named recursion */
11521                     RExC_parse++;
11522                     if (RExC_parse >= RExC_end) {
11523                         vFAIL("Sequence (?P>... not terminated");
11524                     }
11525                     goto named_recursion;
11526                 }
11527                 else if (paren == '=') {   /* (?P=...)  named backref */
11528                     RExC_parse++;
11529                     return handle_named_backref(pRExC_state, flagp,
11530                                                 parse_start, ')');
11531                 }
11532                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11533                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11534                 vFAIL3("Sequence (%.*s...) not recognized",
11535                                 (int) (RExC_parse - seqstart), seqstart);
11536                 NOT_REACHED; /*NOTREACHED*/
11537             case '<':           /* (?<...) */
11538                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11539                 if (*RExC_parse == '!')
11540                     paren = ',';
11541                 else if (*RExC_parse != '=')
11542               named_capture:
11543                 {               /* (?<...>) */
11544                     char *name_start;
11545                     SV *svname;
11546                     paren= '>';
11547                 /* FALLTHROUGH */
11548             case '\'':          /* (?'...') */
11549                     name_start = RExC_parse;
11550                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11551                     if (   RExC_parse == name_start
11552                         || RExC_parse >= RExC_end
11553                         || *RExC_parse != paren)
11554                     {
11555                         vFAIL2("Sequence (?%c... not terminated",
11556                             paren=='>' ? '<' : (char) paren);
11557                     }
11558                     {
11559                         HE *he_str;
11560                         SV *sv_dat = NULL;
11561                         if (!svname) /* shouldn't happen */
11562                             Perl_croak(aTHX_
11563                                 "panic: reg_scan_name returned NULL");
11564                         if (!RExC_paren_names) {
11565                             RExC_paren_names= newHV();
11566                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11567 #ifdef DEBUGGING
11568                             RExC_paren_name_list= newAV();
11569                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11570 #endif
11571                         }
11572                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11573                         if ( he_str )
11574                             sv_dat = HeVAL(he_str);
11575                         if ( ! sv_dat ) {
11576                             /* croak baby croak */
11577                             Perl_croak(aTHX_
11578                                 "panic: paren_name hash element allocation failed");
11579                         } else if ( SvPOK(sv_dat) ) {
11580                             /* (?|...) can mean we have dupes so scan to check
11581                                its already been stored. Maybe a flag indicating
11582                                we are inside such a construct would be useful,
11583                                but the arrays are likely to be quite small, so
11584                                for now we punt -- dmq */
11585                             IV count = SvIV(sv_dat);
11586                             I32 *pv = (I32*)SvPVX(sv_dat);
11587                             IV i;
11588                             for ( i = 0 ; i < count ; i++ ) {
11589                                 if ( pv[i] == RExC_npar ) {
11590                                     count = 0;
11591                                     break;
11592                                 }
11593                             }
11594                             if ( count ) {
11595                                 pv = (I32*)SvGROW(sv_dat,
11596                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11597                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11598                                 pv[count] = RExC_npar;
11599                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11600                             }
11601                         } else {
11602                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11603                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11604                                                                 sizeof(I32));
11605                             SvIOK_on(sv_dat);
11606                             SvIV_set(sv_dat, 1);
11607                         }
11608 #ifdef DEBUGGING
11609                         /* Yes this does cause a memory leak in debugging Perls
11610                          * */
11611                         if (!av_store(RExC_paren_name_list,
11612                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11613                             SvREFCNT_dec_NN(svname);
11614 #endif
11615
11616                         /*sv_dump(sv_dat);*/
11617                     }
11618                     nextchar(pRExC_state);
11619                     paren = 1;
11620                     goto capturing_parens;
11621                 }
11622
11623                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11624                 RExC_in_lookaround++;
11625                 RExC_parse++;
11626                 if (RExC_parse >= RExC_end) {
11627                     vFAIL("Sequence (?... not terminated");
11628                 }
11629                 RExC_seen_zerolen++;
11630                 break;
11631             case '=':           /* (?=...) */
11632                 RExC_seen_zerolen++;
11633                 RExC_in_lookaround++;
11634                 break;
11635             case '!':           /* (?!...) */
11636                 RExC_seen_zerolen++;
11637                 /* check if we're really just a "FAIL" assertion */
11638                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11639                                         FALSE /* Don't force to /x */ );
11640                 if (*RExC_parse == ')') {
11641                     ret=reganode(pRExC_state, OPFAIL, 0);
11642                     nextchar(pRExC_state);
11643                     return ret;
11644                 }
11645                 RExC_in_lookaround++;
11646                 break;
11647             case '|':           /* (?|...) */
11648                 /* branch reset, behave like a (?:...) except that
11649                    buffers in alternations share the same numbers */
11650                 paren = ':';
11651                 after_freeze = freeze_paren = RExC_npar;
11652
11653                 /* XXX This construct currently requires an extra pass.
11654                  * Investigation would be required to see if that could be
11655                  * changed */
11656                 REQUIRE_PARENS_PASS;
11657                 break;
11658             case ':':           /* (?:...) */
11659             case '>':           /* (?>...) */
11660                 break;
11661             case '$':           /* (?$...) */
11662             case '@':           /* (?@...) */
11663                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11664                 break;
11665             case '0' :           /* (?0) */
11666             case 'R' :           /* (?R) */
11667                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11668                     FAIL("Sequence (?R) not terminated");
11669                 num = 0;
11670                 RExC_seen |= REG_RECURSE_SEEN;
11671
11672                 /* XXX These constructs currently require an extra pass.
11673                  * It probably could be changed */
11674                 REQUIRE_PARENS_PASS;
11675
11676                 *flagp |= POSTPONED;
11677                 goto gen_recurse_regop;
11678                 /*notreached*/
11679             /* named and numeric backreferences */
11680             case '&':            /* (?&NAME) */
11681                 parse_start = RExC_parse - 1;
11682               named_recursion:
11683                 {
11684                     SV *sv_dat = reg_scan_name(pRExC_state,
11685                                                REG_RSN_RETURN_DATA);
11686                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11687                 }
11688                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11689                     vFAIL("Sequence (?&... not terminated");
11690                 goto gen_recurse_regop;
11691                 /* NOTREACHED */
11692             case '+':
11693                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11694                     RExC_parse++;
11695                     vFAIL("Illegal pattern");
11696                 }
11697                 goto parse_recursion;
11698                 /* NOTREACHED*/
11699             case '-': /* (?-1) */
11700                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11701                     RExC_parse--; /* rewind to let it be handled later */
11702                     goto parse_flags;
11703                 }
11704                 /* FALLTHROUGH */
11705             case '1': case '2': case '3': case '4': /* (?1) */
11706             case '5': case '6': case '7': case '8': case '9':
11707                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11708               parse_recursion:
11709                 {
11710                     bool is_neg = FALSE;
11711                     UV unum;
11712                     parse_start = RExC_parse - 1; /* MJD */
11713                     if (*RExC_parse == '-') {
11714                         RExC_parse++;
11715                         is_neg = TRUE;
11716                     }
11717                     endptr = RExC_end;
11718                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11719                         && unum <= I32_MAX
11720                     ) {
11721                         num = (I32)unum;
11722                         RExC_parse = (char*)endptr;
11723                     }
11724                     else {  /* Overflow, or something like that.  Position
11725                                beyond all digits for the message */
11726                         while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
11727                             RExC_parse++;
11728                         }
11729                         vFAIL(impossible_group);
11730                     }
11731                     if (is_neg) {
11732                         /* -num is always representable on 1 and 2's complement
11733                          * machines */
11734                         num = -num;
11735                     }
11736                 }
11737                 if (*RExC_parse!=')')
11738                     vFAIL("Expecting close bracket");
11739
11740               gen_recurse_regop:
11741                 if (paren == '-' || paren == '+') {
11742
11743                     /* Don't overflow */
11744                     if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11745                         RExC_parse++;
11746                         vFAIL(impossible_group);
11747                     }
11748
11749                     /*
11750                     Diagram of capture buffer numbering.
11751                     Top line is the normal capture buffer numbers
11752                     Bottom line is the negative indexing as from
11753                     the X (the (?-2))
11754
11755                         1 2    3 4 5 X   Y      6 7
11756                        /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11757                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11758                     -   5 4    3 2 1 X   Y      x x
11759
11760                     Resolve to absolute group.  Recall that RExC_npar is +1 of
11761                     the actual parenthesis group number.  For lookahead, we
11762                     have to compensate for that.  Using the above example, when
11763                     we get to Y in the parse, num is 2 and RExC_npar is 6.  We
11764                     want 7 for +2, and 4 for -2.
11765                     */
11766                     if ( paren == '+' ) {
11767                         num--;
11768                     }
11769
11770                     num += RExC_npar;
11771
11772                     if (paren == '-' && num < 1) {
11773                         RExC_parse++;
11774                         vFAIL(non_existent_group_msg);
11775                     }
11776                 }
11777
11778                 if (num >= RExC_npar) {
11779
11780                     /* It might be a forward reference; we can't fail until we
11781                      * know, by completing the parse to get all the groups, and
11782                      * then reparsing */
11783                     if (ALL_PARENS_COUNTED)  {
11784                         if (num >= RExC_total_parens) {
11785                             RExC_parse++;
11786                             vFAIL(non_existent_group_msg);
11787                         }
11788                     }
11789                     else {
11790                         REQUIRE_PARENS_PASS;
11791                     }
11792                 }
11793
11794                 /* We keep track how many GOSUB items we have produced.
11795                    To start off the ARG2L() of the GOSUB holds its "id",
11796                    which is used later in conjunction with RExC_recurse
11797                    to calculate the offset we need to jump for the GOSUB,
11798                    which it will store in the final representation.
11799                    We have to defer the actual calculation until much later
11800                    as the regop may move.
11801                  */
11802                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11803                 RExC_recurse_count++;
11804                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11805                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11806                             22, "|    |", (int)(depth * 2 + 1), "",
11807                             (UV)ARG(REGNODE_p(ret)),
11808                             (IV)ARG2L(REGNODE_p(ret))));
11809                 RExC_seen |= REG_RECURSE_SEEN;
11810
11811                 Set_Node_Length(REGNODE_p(ret),
11812                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11813                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11814
11815                 *flagp |= POSTPONED;
11816                 assert(*RExC_parse == ')');
11817                 nextchar(pRExC_state);
11818                 return ret;
11819
11820             /* NOTREACHED */
11821
11822             case '?':           /* (??...) */
11823                 is_logical = 1;
11824                 if (*RExC_parse != '{') {
11825                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11826                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11827                     vFAIL2utf8f(
11828                         "Sequence (%" UTF8f "...) not recognized",
11829                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11830                     NOT_REACHED; /*NOTREACHED*/
11831                 }
11832                 *flagp |= POSTPONED;
11833                 paren = '{';
11834                 RExC_parse++;
11835                 /* FALLTHROUGH */
11836             case '{':           /* (?{...}) */
11837             {
11838                 U32 n = 0;
11839                 struct reg_code_block *cb;
11840                 OP * o;
11841
11842                 RExC_seen_zerolen++;
11843
11844                 if (   !pRExC_state->code_blocks
11845                     || pRExC_state->code_index
11846                                         >= pRExC_state->code_blocks->count
11847                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11848                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11849                             - RExC_start)
11850                 ) {
11851                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11852                         FAIL("panic: Sequence (?{...}): no code block found\n");
11853                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11854                 }
11855                 /* this is a pre-compiled code block (?{...}) */
11856                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11857                 RExC_parse = RExC_start + cb->end;
11858                 o = cb->block;
11859                 if (cb->src_regex) {
11860                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11861                     RExC_rxi->data->data[n] =
11862                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11863                     RExC_rxi->data->data[n+1] = (void*)o;
11864                 }
11865                 else {
11866                     n = add_data(pRExC_state,
11867                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11868                     RExC_rxi->data->data[n] = (void*)o;
11869                 }
11870                 pRExC_state->code_index++;
11871                 nextchar(pRExC_state);
11872
11873                 if (is_logical) {
11874                     regnode_offset eval;
11875                     ret = reg_node(pRExC_state, LOGICAL);
11876
11877                     eval = reg2Lanode(pRExC_state, EVAL,
11878                                        n,
11879
11880                                        /* for later propagation into (??{})
11881                                         * return value */
11882                                        RExC_flags & RXf_PMf_COMPILETIME
11883                                       );
11884                     FLAGS(REGNODE_p(ret)) = 2;
11885                     if (! REGTAIL(pRExC_state, ret, eval)) {
11886                         REQUIRE_BRANCHJ(flagp, 0);
11887                     }
11888                     /* deal with the length of this later - MJD */
11889                     return ret;
11890                 }
11891                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11892                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11893                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11894                 return ret;
11895             }
11896             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11897             {
11898                 int is_define= 0;
11899                 const int DEFINE_len = sizeof("DEFINE") - 1;
11900                 if (    RExC_parse < RExC_end - 1
11901                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11902                             && (   RExC_parse[1] == '='
11903                                 || RExC_parse[1] == '!'
11904                                 || RExC_parse[1] == '<'
11905                                 || RExC_parse[1] == '{'))
11906                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11907                             && (   memBEGINs(RExC_parse + 1,
11908                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11909                                          "pla:")
11910                                 || memBEGINs(RExC_parse + 1,
11911                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11912                                          "plb:")
11913                                 || memBEGINs(RExC_parse + 1,
11914                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11915                                          "nla:")
11916                                 || memBEGINs(RExC_parse + 1,
11917                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11918                                          "nlb:")
11919                                 || memBEGINs(RExC_parse + 1,
11920                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11921                                          "positive_lookahead:")
11922                                 || memBEGINs(RExC_parse + 1,
11923                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11924                                          "positive_lookbehind:")
11925                                 || memBEGINs(RExC_parse + 1,
11926                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11927                                          "negative_lookahead:")
11928                                 || memBEGINs(RExC_parse + 1,
11929                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11930                                          "negative_lookbehind:"))))
11931                 ) { /* Lookahead or eval. */
11932                     I32 flag;
11933                     regnode_offset tail;
11934
11935                     ret = reg_node(pRExC_state, LOGICAL);
11936                     FLAGS(REGNODE_p(ret)) = 1;
11937
11938                     tail = reg(pRExC_state, 1, &flag, depth+1);
11939                     RETURN_FAIL_ON_RESTART(flag, flagp);
11940                     if (! REGTAIL(pRExC_state, ret, tail)) {
11941                         REQUIRE_BRANCHJ(flagp, 0);
11942                     }
11943                     goto insert_if;
11944                 }
11945                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11946                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11947                 {
11948                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11949                     char *name_start= RExC_parse++;
11950                     U32 num = 0;
11951                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11952                     if (   RExC_parse == name_start
11953                         || RExC_parse >= RExC_end
11954                         || *RExC_parse != ch)
11955                     {
11956                         vFAIL2("Sequence (?(%c... not terminated",
11957                             (ch == '>' ? '<' : ch));
11958                     }
11959                     RExC_parse++;
11960                     if (sv_dat) {
11961                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11962                         RExC_rxi->data->data[num]=(void*)sv_dat;
11963                         SvREFCNT_inc_simple_void_NN(sv_dat);
11964                     }
11965                     ret = reganode(pRExC_state, GROUPPN, num);
11966                     goto insert_if_check_paren;
11967                 }
11968                 else if (memBEGINs(RExC_parse,
11969                                    (STRLEN) (RExC_end - RExC_parse),
11970                                    "DEFINE"))
11971                 {
11972                     ret = reganode(pRExC_state, DEFINEP, 0);
11973                     RExC_parse += DEFINE_len;
11974                     is_define = 1;
11975                     goto insert_if_check_paren;
11976                 }
11977                 else if (RExC_parse[0] == 'R') {
11978                     RExC_parse++;
11979                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11980                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11981                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11982                      */
11983                     parno = 0;
11984                     if (RExC_parse[0] == '0') {
11985                         parno = 1;
11986                         RExC_parse++;
11987                     }
11988                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11989                         UV uv;
11990                         endptr = RExC_end;
11991                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11992                             && uv <= I32_MAX
11993                         ) {
11994                             parno = (I32)uv + 1;
11995                             RExC_parse = (char*)endptr;
11996                         }
11997                         /* else "Switch condition not recognized" below */
11998                     } else if (RExC_parse[0] == '&') {
11999                         SV *sv_dat;
12000                         RExC_parse++;
12001                         sv_dat = reg_scan_name(pRExC_state,
12002                                                REG_RSN_RETURN_DATA);
12003                         if (sv_dat)
12004                             parno = 1 + *((I32 *)SvPVX(sv_dat));
12005                     }
12006                     ret = reganode(pRExC_state, INSUBP, parno);
12007                     goto insert_if_check_paren;
12008                 }
12009                 else if (inRANGE(RExC_parse[0], '1', '9')) {
12010                     /* (?(1)...) */
12011                     char c;
12012                     UV uv;
12013                     endptr = RExC_end;
12014                     if (grok_atoUV(RExC_parse, &uv, &endptr)
12015                         && uv <= I32_MAX
12016                     ) {
12017                         parno = (I32)uv;
12018                         RExC_parse = (char*)endptr;
12019                     }
12020                     else {
12021                         vFAIL("panic: grok_atoUV returned FALSE");
12022                     }
12023                     ret = reganode(pRExC_state, GROUPP, parno);
12024
12025                  insert_if_check_paren:
12026                     if (UCHARAT(RExC_parse) != ')') {
12027                         RExC_parse += UTF
12028                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12029                                       : 1;
12030                         vFAIL("Switch condition not recognized");
12031                     }
12032                     nextchar(pRExC_state);
12033                   insert_if:
12034                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12035                                                              IFTHEN, 0)))
12036                     {
12037                         REQUIRE_BRANCHJ(flagp, 0);
12038                     }
12039                     br = regbranch(pRExC_state, &flags, 1, depth+1);
12040                     if (br == 0) {
12041                         RETURN_FAIL_ON_RESTART(flags,flagp);
12042                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12043                               (UV) flags);
12044                     } else
12045                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12046                                                              LONGJMP, 0)))
12047                     {
12048                         REQUIRE_BRANCHJ(flagp, 0);
12049                     }
12050                     c = UCHARAT(RExC_parse);
12051                     nextchar(pRExC_state);
12052                     if (flags&HASWIDTH)
12053                         *flagp |= HASWIDTH;
12054                     if (c == '|') {
12055                         if (is_define)
12056                             vFAIL("(?(DEFINE)....) does not allow branches");
12057
12058                         /* Fake one for optimizer.  */
12059                         lastbr = reganode(pRExC_state, IFTHEN, 0);
12060
12061                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12062                             RETURN_FAIL_ON_RESTART(flags, flagp);
12063                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12064                                   (UV) flags);
12065                         }
12066                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
12067                             REQUIRE_BRANCHJ(flagp, 0);
12068                         }
12069                         if (flags&HASWIDTH)
12070                             *flagp |= HASWIDTH;
12071                         c = UCHARAT(RExC_parse);
12072                         nextchar(pRExC_state);
12073                     }
12074                     else
12075                         lastbr = 0;
12076                     if (c != ')') {
12077                         if (RExC_parse >= RExC_end)
12078                             vFAIL("Switch (?(condition)... not terminated");
12079                         else
12080                             vFAIL("Switch (?(condition)... contains too many branches");
12081                     }
12082                     ender = reg_node(pRExC_state, TAIL);
12083                     if (! REGTAIL(pRExC_state, br, ender)) {
12084                         REQUIRE_BRANCHJ(flagp, 0);
12085                     }
12086                     if (lastbr) {
12087                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12088                             REQUIRE_BRANCHJ(flagp, 0);
12089                         }
12090                         if (! REGTAIL(pRExC_state,
12091                                       REGNODE_OFFSET(
12092                                                  NEXTOPER(
12093                                                  NEXTOPER(REGNODE_p(lastbr)))),
12094                                       ender))
12095                         {
12096                             REQUIRE_BRANCHJ(flagp, 0);
12097                         }
12098                     }
12099                     else
12100                         if (! REGTAIL(pRExC_state, ret, ender)) {
12101                             REQUIRE_BRANCHJ(flagp, 0);
12102                         }
12103 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
12104                     RExC_size++; /* XXX WHY do we need this?!!
12105                                     For large programs it seems to be required
12106                                     but I can't figure out why. -- dmq*/
12107 #endif
12108                     return ret;
12109                 }
12110                 RExC_parse += UTF
12111                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12112                               : 1;
12113                 vFAIL("Unknown switch condition (?(...))");
12114             }
12115             case '[':           /* (?[ ... ]) */
12116                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12117                                          oregcomp_parse);
12118             case 0: /* A NUL */
12119                 RExC_parse--; /* for vFAIL to print correctly */
12120                 vFAIL("Sequence (? incomplete");
12121                 break;
12122
12123             case ')':
12124                 if (RExC_strict) {  /* [perl #132851] */
12125                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12126                 }
12127                 /* FALLTHROUGH */
12128             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12129             /* FALLTHROUGH */
12130             default: /* e.g., (?i) */
12131                 RExC_parse = (char *) seqstart + 1;
12132               parse_flags:
12133                 parse_lparen_question_flags(pRExC_state);
12134                 if (UCHARAT(RExC_parse) != ':') {
12135                     if (RExC_parse < RExC_end)
12136                         nextchar(pRExC_state);
12137                     *flagp = TRYAGAIN;
12138                     return 0;
12139                 }
12140                 paren = ':';
12141                 nextchar(pRExC_state);
12142                 ret = 0;
12143                 goto parse_rest;
12144             } /* end switch */
12145         }
12146         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12147           capturing_parens:
12148             parno = RExC_npar;
12149             RExC_npar++;
12150             if (! ALL_PARENS_COUNTED) {
12151                 /* If we are in our first pass through (and maybe only pass),
12152                  * we  need to allocate memory for the capturing parentheses
12153                  * data structures.
12154                  */
12155
12156                 if (!RExC_parens_buf_size) {
12157                     /* first guess at number of parens we might encounter */
12158                     RExC_parens_buf_size = 10;
12159
12160                     /* setup RExC_open_parens, which holds the address of each
12161                      * OPEN tag, and to make things simpler for the 0 index the
12162                      * start of the program - this is used later for offsets */
12163                     Newxz(RExC_open_parens, RExC_parens_buf_size,
12164                             regnode_offset);
12165                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12166
12167                     /* setup RExC_close_parens, which holds the address of each
12168                      * CLOSE tag, and to make things simpler for the 0 index
12169                      * the end of the program - this is used later for offsets
12170                      * */
12171                     Newxz(RExC_close_parens, RExC_parens_buf_size,
12172                             regnode_offset);
12173                     /* we dont know where end op starts yet, so we dont need to
12174                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12175                      * above */
12176                 }
12177                 else if (RExC_npar > RExC_parens_buf_size) {
12178                     I32 old_size = RExC_parens_buf_size;
12179
12180                     RExC_parens_buf_size *= 2;
12181
12182                     Renew(RExC_open_parens, RExC_parens_buf_size,
12183                             regnode_offset);
12184                     Zero(RExC_open_parens + old_size,
12185                             RExC_parens_buf_size - old_size, regnode_offset);
12186
12187                     Renew(RExC_close_parens, RExC_parens_buf_size,
12188                             regnode_offset);
12189                     Zero(RExC_close_parens + old_size,
12190                             RExC_parens_buf_size - old_size, regnode_offset);
12191                 }
12192             }
12193
12194             ret = reganode(pRExC_state, OPEN, parno);
12195             if (!RExC_nestroot)
12196                 RExC_nestroot = parno;
12197             if (RExC_open_parens && !RExC_open_parens[parno])
12198             {
12199                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12200                     "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12201                     22, "|    |", (int)(depth * 2 + 1), "",
12202                     (IV)parno, ret));
12203                 RExC_open_parens[parno]= ret;
12204             }
12205
12206             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12207             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12208             is_open = 1;
12209         } else {
12210             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12211             paren = ':';
12212             ret = 0;
12213         }
12214     }
12215     else                        /* ! paren */
12216         ret = 0;
12217
12218    parse_rest:
12219     /* Pick up the branches, linking them together. */
12220     parse_start = RExC_parse;   /* MJD */
12221     br = regbranch(pRExC_state, &flags, 1, depth+1);
12222
12223     /*     branch_len = (paren != 0); */
12224
12225     if (br == 0) {
12226         RETURN_FAIL_ON_RESTART(flags, flagp);
12227         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12228     }
12229     if (*RExC_parse == '|') {
12230         if (RExC_use_BRANCHJ) {
12231             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12232         }
12233         else {                  /* MJD */
12234             reginsert(pRExC_state, BRANCH, br, depth+1);
12235             Set_Node_Length(REGNODE_p(br), paren != 0);
12236             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12237         }
12238         have_branch = 1;
12239     }
12240     else if (paren == ':') {
12241         *flagp |= flags&SIMPLE;
12242     }
12243     if (is_open) {                              /* Starts with OPEN. */
12244         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12245             REQUIRE_BRANCHJ(flagp, 0);
12246         }
12247     }
12248     else if (paren != '?')              /* Not Conditional */
12249         ret = br;
12250     *flagp |= flags & (HASWIDTH | POSTPONED);
12251     lastbr = br;
12252     while (*RExC_parse == '|') {
12253         if (RExC_use_BRANCHJ) {
12254             bool shut_gcc_up;
12255
12256             ender = reganode(pRExC_state, LONGJMP, 0);
12257
12258             /* Append to the previous. */
12259             shut_gcc_up = REGTAIL(pRExC_state,
12260                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12261                          ender);
12262             PERL_UNUSED_VAR(shut_gcc_up);
12263         }
12264         nextchar(pRExC_state);
12265         if (freeze_paren) {
12266             if (RExC_npar > after_freeze)
12267                 after_freeze = RExC_npar;
12268             RExC_npar = freeze_paren;
12269         }
12270         br = regbranch(pRExC_state, &flags, 0, depth+1);
12271
12272         if (br == 0) {
12273             RETURN_FAIL_ON_RESTART(flags, flagp);
12274             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12275         }
12276         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12277             REQUIRE_BRANCHJ(flagp, 0);
12278         }
12279         lastbr = br;
12280         *flagp |= flags & (HASWIDTH | POSTPONED);
12281     }
12282
12283     if (have_branch || paren != ':') {
12284         regnode * br;
12285
12286         /* Make a closing node, and hook it on the end. */
12287         switch (paren) {
12288         case ':':
12289             ender = reg_node(pRExC_state, TAIL);
12290             break;
12291         case 1: case 2:
12292             ender = reganode(pRExC_state, CLOSE, parno);
12293             if ( RExC_close_parens ) {
12294                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12295                         "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12296                         22, "|    |", (int)(depth * 2 + 1), "",
12297                         (IV)parno, ender));
12298                 RExC_close_parens[parno]= ender;
12299                 if (RExC_nestroot == parno)
12300                     RExC_nestroot = 0;
12301             }
12302             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12303             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12304             break;
12305         case 's':
12306             ender = reg_node(pRExC_state, SRCLOSE);
12307             RExC_in_script_run = 0;
12308             break;
12309         case '<':
12310         case 'a':
12311         case 'A':
12312         case 'b':
12313         case 'B':
12314         case ',':
12315         case '=':
12316         case '!':
12317             *flagp &= ~HASWIDTH;
12318             /* FALLTHROUGH */
12319         case 't':   /* aTomic */
12320         case '>':
12321             ender = reg_node(pRExC_state, SUCCEED);
12322             break;
12323         case 0:
12324             ender = reg_node(pRExC_state, END);
12325             assert(!RExC_end_op); /* there can only be one! */
12326             RExC_end_op = REGNODE_p(ender);
12327             if (RExC_close_parens) {
12328                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12329                     "%*s%*s Setting close paren #0 (END) to %zu\n",
12330                     22, "|    |", (int)(depth * 2 + 1), "",
12331                     ender));
12332
12333                 RExC_close_parens[0]= ender;
12334             }
12335             break;
12336         }
12337         DEBUG_PARSE_r({
12338             DEBUG_PARSE_MSG("lsbr");
12339             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12340             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12341             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12342                           SvPV_nolen_const(RExC_mysv1),
12343                           (IV)lastbr,
12344                           SvPV_nolen_const(RExC_mysv2),
12345                           (IV)ender,
12346                           (IV)(ender - lastbr)
12347             );
12348         });
12349         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12350             REQUIRE_BRANCHJ(flagp, 0);
12351         }
12352
12353         if (have_branch) {
12354             char is_nothing= 1;
12355             if (depth==1)
12356                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12357
12358             /* Hook the tails of the branches to the closing node. */
12359             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12360                 const U8 op = PL_regkind[OP(br)];
12361                 if (op == BRANCH) {
12362                     if (! REGTAIL_STUDY(pRExC_state,
12363                                         REGNODE_OFFSET(NEXTOPER(br)),
12364                                         ender))
12365                     {
12366                         REQUIRE_BRANCHJ(flagp, 0);
12367                     }
12368                     if ( OP(NEXTOPER(br)) != NOTHING
12369                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12370                         is_nothing= 0;
12371                 }
12372                 else if (op == BRANCHJ) {
12373                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12374                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12375                                         ender);
12376                     PERL_UNUSED_VAR(shut_gcc_up);
12377                     /* for now we always disable this optimisation * /
12378                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12379                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12380                     */
12381                         is_nothing= 0;
12382                 }
12383             }
12384             if (is_nothing) {
12385                 regnode * ret_as_regnode = REGNODE_p(ret);
12386                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12387                                ? regnext(ret_as_regnode)
12388                                : ret_as_regnode;
12389                 DEBUG_PARSE_r({
12390                     DEBUG_PARSE_MSG("NADA");
12391                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12392                                      NULL, pRExC_state);
12393                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12394                                      NULL, pRExC_state);
12395                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12396                                   SvPV_nolen_const(RExC_mysv1),
12397                                   (IV)REG_NODE_NUM(ret_as_regnode),
12398                                   SvPV_nolen_const(RExC_mysv2),
12399                                   (IV)ender,
12400                                   (IV)(ender - ret)
12401                     );
12402                 });
12403                 OP(br)= NOTHING;
12404                 if (OP(REGNODE_p(ender)) == TAIL) {
12405                     NEXT_OFF(br)= 0;
12406                     RExC_emit= REGNODE_OFFSET(br) + 1;
12407                 } else {
12408                     regnode *opt;
12409                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12410                         OP(opt)= OPTIMIZED;
12411                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12412                 }
12413             }
12414         }
12415     }
12416
12417     {
12418         const char *p;
12419          /* Even/odd or x=don't care: 010101x10x */
12420         static const char parens[] = "=!aA<,>Bbt";
12421          /* flag below is set to 0 up through 'A'; 1 for larger */
12422
12423         if (paren && (p = strchr(parens, paren))) {
12424             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12425             int flag = (p - parens) > 3;
12426
12427             if (paren == '>' || paren == 't') {
12428                 node = SUSPEND, flag = 0;
12429             }
12430
12431             reginsert(pRExC_state, node, ret, depth+1);
12432             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12433             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12434             FLAGS(REGNODE_p(ret)) = flag;
12435             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12436             {
12437                 REQUIRE_BRANCHJ(flagp, 0);
12438             }
12439         }
12440     }
12441
12442     /* Check for proper termination. */
12443     if (paren) {
12444         /* restore original flags, but keep (?p) and, if we've encountered
12445          * something in the parse that changes /d rules into /u, keep the /u */
12446         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12447         if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12448             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12449         }
12450         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12451             RExC_parse = oregcomp_parse;
12452             vFAIL("Unmatched (");
12453         }
12454         nextchar(pRExC_state);
12455     }
12456     else if (!paren && RExC_parse < RExC_end) {
12457         if (*RExC_parse == ')') {
12458             RExC_parse++;
12459             vFAIL("Unmatched )");
12460         }
12461         else
12462             FAIL("Junk on end of regexp");      /* "Can't happen". */
12463         NOT_REACHED; /* NOTREACHED */
12464     }
12465
12466     if (after_freeze > RExC_npar)
12467         RExC_npar = after_freeze;
12468
12469     RExC_in_lookaround = was_in_lookaround;
12470
12471     return(ret);
12472 }
12473
12474 /*
12475  - regbranch - one alternative of an | operator
12476  *
12477  * Implements the concatenation operator.
12478  *
12479  * On success, returns the offset at which any next node should be placed into
12480  * the regex engine program being compiled.
12481  *
12482  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12483  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12484  * UTF-8
12485  */
12486 STATIC regnode_offset
12487 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12488 {
12489     regnode_offset ret;
12490     regnode_offset chain = 0;
12491     regnode_offset latest;
12492     I32 flags = 0, c = 0;
12493     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12494
12495     PERL_ARGS_ASSERT_REGBRANCH;
12496
12497     DEBUG_PARSE("brnc");
12498
12499     if (first)
12500         ret = 0;
12501     else {
12502         if (RExC_use_BRANCHJ)
12503             ret = reganode(pRExC_state, BRANCHJ, 0);
12504         else {
12505             ret = reg_node(pRExC_state, BRANCH);
12506             Set_Node_Length(REGNODE_p(ret), 1);
12507         }
12508     }
12509
12510     *flagp = 0;                 /* Initialize. */
12511
12512     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12513                             FALSE /* Don't force to /x */ );
12514     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12515         flags &= ~TRYAGAIN;
12516         latest = regpiece(pRExC_state, &flags, depth+1);
12517         if (latest == 0) {
12518             if (flags & TRYAGAIN)
12519                 continue;
12520             RETURN_FAIL_ON_RESTART(flags, flagp);
12521             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12522         }
12523         else if (ret == 0)
12524             ret = latest;
12525         *flagp |= flags&(HASWIDTH|POSTPONED);
12526         if (chain != 0) {
12527             /* FIXME adding one for every branch after the first is probably
12528              * excessive now we have TRIE support. (hv) */
12529             MARK_NAUGHTY(1);
12530             if (! REGTAIL(pRExC_state, chain, latest)) {
12531                 /* XXX We could just redo this branch, but figuring out what
12532                  * bookkeeping needs to be reset is a pain, and it's likely
12533                  * that other branches that goto END will also be too large */
12534                 REQUIRE_BRANCHJ(flagp, 0);
12535             }
12536         }
12537         chain = latest;
12538         c++;
12539     }
12540     if (chain == 0) {   /* Loop ran zero times. */
12541         chain = reg_node(pRExC_state, NOTHING);
12542         if (ret == 0)
12543             ret = chain;
12544     }
12545     if (c == 1) {
12546         *flagp |= flags&SIMPLE;
12547     }
12548
12549     return ret;
12550 }
12551
12552 /*
12553  - regcurly - a little FSA that accepts {\d+,?\d*}
12554     Pulled from reg.c.
12555  */
12556 bool
12557 Perl_regcurly(const char *s)
12558 {
12559     PERL_ARGS_ASSERT_REGCURLY;
12560
12561     if (*s++ != '{')
12562         return FALSE;
12563     if (!isDIGIT(*s))
12564         return FALSE;
12565     while (isDIGIT(*s))
12566         s++;
12567     if (*s == ',') {
12568         s++;
12569         while (isDIGIT(*s))
12570             s++;
12571     }
12572
12573     return *s == '}';
12574 }
12575
12576 /*
12577  - regpiece - something followed by possible quantifier * + ? {n,m}
12578  *
12579  * Note that the branching code sequences used for ? and the general cases
12580  * of * and + are somewhat optimized:  they use the same NOTHING node as
12581  * both the endmarker for their branch list and the body of the last branch.
12582  * It might seem that this node could be dispensed with entirely, but the
12583  * endmarker role is not redundant.
12584  *
12585  * On success, returns the offset at which any next node should be placed into
12586  * the regex engine program being compiled.
12587  *
12588  * Returns 0 otherwise, with *flagp set to indicate why:
12589  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12590  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12591  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12592  */
12593 STATIC regnode_offset
12594 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12595 {
12596     regnode_offset ret;
12597     char op;
12598     char *next;
12599     I32 flags;
12600     const char * const origparse = RExC_parse;
12601     I32 min;
12602     I32 max = REG_INFTY;
12603 #ifdef RE_TRACK_PATTERN_OFFSETS
12604     char *parse_start;
12605 #endif
12606     const char *maxpos = NULL;
12607     UV uv;
12608
12609     /* Save the original in case we change the emitted regop to a FAIL. */
12610     const regnode_offset orig_emit = RExC_emit;
12611
12612     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12613
12614     PERL_ARGS_ASSERT_REGPIECE;
12615
12616     DEBUG_PARSE("piec");
12617
12618     ret = regatom(pRExC_state, &flags, depth+1);
12619     if (ret == 0) {
12620         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12621         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12622     }
12623
12624 #ifdef RE_TRACK_PATTERN_OFFSETS
12625     parse_start = RExC_parse;
12626 #endif
12627
12628     op = *RExC_parse;
12629     switch (op) {
12630
12631       case '*':
12632         nextchar(pRExC_state);
12633         min = 0;
12634         break;
12635
12636       case '+':
12637         nextchar(pRExC_state);
12638         min = 1;
12639         break;
12640
12641       case '?':
12642         nextchar(pRExC_state);
12643         min = 0; max = 1;
12644         break;
12645
12646       case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
12647                     to determine which */
12648         if (regcurly(RExC_parse)) {
12649             const char* endptr;
12650
12651             /* Here is a quantifier, parse for min and max values */
12652             maxpos = NULL;
12653             next = RExC_parse + 1;
12654             while (isDIGIT(*next) || *next == ',') {
12655                 if (*next == ',') {
12656                     if (maxpos)
12657                         break;
12658                     else
12659                         maxpos = next;
12660                 }
12661                 next++;
12662             }
12663
12664             assert(*next == '}');
12665
12666             if (!maxpos)
12667                 maxpos = next;
12668             RExC_parse++;
12669             if (isDIGIT(*RExC_parse)) {
12670                 endptr = RExC_end;
12671                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12672                     vFAIL("Invalid quantifier in {,}");
12673                 if (uv >= REG_INFTY)
12674                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12675                 min = (I32)uv;
12676             } else {
12677                 min = 0;
12678             }
12679             if (*maxpos == ',')
12680                 maxpos++;
12681             else
12682                 maxpos = RExC_parse;
12683             if (isDIGIT(*maxpos)) {
12684                 endptr = RExC_end;
12685                 if (!grok_atoUV(maxpos, &uv, &endptr))
12686                     vFAIL("Invalid quantifier in {,}");
12687                 if (uv >= REG_INFTY)
12688                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12689                 max = (I32)uv;
12690             } else {
12691                 max = REG_INFTY;            /* meaning "infinity" */
12692             }
12693
12694             RExC_parse = next;
12695             nextchar(pRExC_state);
12696             if (max < min) {    /* If can't match, warn and optimize to fail
12697                                    unconditionally */
12698                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12699                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12700                 NEXT_OFF(REGNODE_p(orig_emit)) =
12701                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12702                 return ret;
12703             }
12704             else if (min == max && *RExC_parse == '?')
12705             {
12706                 ckWARN2reg(RExC_parse + 1,
12707                            "Useless use of greediness modifier '%c'",
12708                            *RExC_parse);
12709             }
12710
12711             break;
12712         } /* End of is regcurly() */
12713
12714         /* Here was a '{', but what followed it didn't form a quantifier. */
12715         /* FALLTHROUGH */
12716
12717       default:
12718         *flagp = flags;
12719         return(ret);
12720         NOT_REACHED; /*NOTREACHED*/
12721     }
12722
12723     /* Here we have a quantifier, and have calculated 'min' and 'max'.
12724      *
12725      * Check and possibly adjust a zero width operand */
12726     if (! (flags & (HASWIDTH|POSTPONED))) {
12727         if (max > REG_INFTY/3) {
12728             if (origparse[0] == '\\' && origparse[1] == 'K') {
12729                 vFAIL2utf8f(
12730                            "%" UTF8f " is forbidden - matches null string"
12731                            " many times",
12732                            UTF8fARG(UTF, (RExC_parse >= origparse
12733                                          ? RExC_parse - origparse
12734                                          : 0),
12735                            origparse));
12736             } else {
12737                 ckWARN2reg(RExC_parse,
12738                            "%" UTF8f " matches null string many times",
12739                            UTF8fARG(UTF, (RExC_parse >= origparse
12740                                          ? RExC_parse - origparse
12741                                          : 0),
12742                            origparse));
12743             }
12744         }
12745
12746         /* There's no point in trying to match something 0 length more than
12747          * once except for extra side effects, which we don't have here since
12748          * not POSTPONED */
12749         if (max > 1) {
12750             max = 1;
12751             if (min > max) {
12752                 min = max;
12753             }
12754         }
12755     }
12756
12757     /* If this is a code block pass it up */
12758     *flagp |= (flags & POSTPONED);
12759
12760     if (max > 0) {
12761         *flagp |= (flags & HASWIDTH);
12762         if (max == REG_INFTY)
12763             RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12764     }
12765
12766     /* 'SIMPLE' operands don't require full generality */
12767     if ((flags&SIMPLE)) {
12768         if (max == REG_INFTY) {
12769             if (min == 0) {
12770                 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
12771                     goto min0_maxINF_wildcard_forbidden;
12772                 }
12773
12774                 reginsert(pRExC_state, STAR, ret, depth+1);
12775                 MARK_NAUGHTY(4);
12776                 goto done_main_op;
12777             }
12778             else if (min == 1) {
12779                 reginsert(pRExC_state, PLUS, ret, depth+1);
12780                 MARK_NAUGHTY(3);
12781                 goto done_main_op;
12782             }
12783         }
12784
12785         /* Here, SIMPLE, but not the '*' and '+' special cases */
12786
12787         MARK_NAUGHTY_EXP(2, 2);
12788         reginsert(pRExC_state, CURLY, ret, depth+1);
12789         Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12790         Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12791     }
12792     else {  /* not SIMPLE */
12793         const regnode_offset w = reg_node(pRExC_state, WHILEM);
12794
12795         FLAGS(REGNODE_p(w)) = 0;
12796         if (!  REGTAIL(pRExC_state, ret, w)) {
12797             REQUIRE_BRANCHJ(flagp, 0);
12798         }
12799         if (RExC_use_BRANCHJ) {
12800             reginsert(pRExC_state, LONGJMP, ret, depth+1);
12801             reginsert(pRExC_state, NOTHING, ret, depth+1);
12802             NEXT_OFF(REGNODE_p(ret)) = 3;        /* Go over LONGJMP. */
12803         }
12804         reginsert(pRExC_state, CURLYX, ret, depth+1);
12805                         /* MJD hk */
12806         Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12807         Set_Node_Length(REGNODE_p(ret),
12808                         op == '{' ? (RExC_parse - parse_start) : 1);
12809
12810         if (RExC_use_BRANCHJ)
12811             NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12812                                                LONGJMP. */
12813         if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12814                                                   NOTHING)))
12815         {
12816             REQUIRE_BRANCHJ(flagp, 0);
12817         }
12818         RExC_whilem_seen++;
12819         MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12820     }
12821
12822     /* Finish up the CURLY/CURLYX case */
12823     FLAGS(REGNODE_p(ret)) = 0;
12824
12825     ARG1_SET(REGNODE_p(ret), (U16)min);
12826     ARG2_SET(REGNODE_p(ret), (U16)max);
12827
12828   done_main_op:
12829
12830     /* Process any greediness modifiers */
12831     if (*RExC_parse == '?') {
12832         nextchar(pRExC_state);
12833         reginsert(pRExC_state, MINMOD, ret, depth+1);
12834         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12835             REQUIRE_BRANCHJ(flagp, 0);
12836         }
12837     }
12838     else if (*RExC_parse == '+') {
12839         regnode_offset ender;
12840         nextchar(pRExC_state);
12841         ender = reg_node(pRExC_state, SUCCEED);
12842         if (! REGTAIL(pRExC_state, ret, ender)) {
12843             REQUIRE_BRANCHJ(flagp, 0);
12844         }
12845         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12846         ender = reg_node(pRExC_state, TAIL);
12847         if (! REGTAIL(pRExC_state, ret, ender)) {
12848             REQUIRE_BRANCHJ(flagp, 0);
12849         }
12850     }
12851
12852     /* Forbid extra quantifiers */
12853     if (ISMULT2(RExC_parse)) {
12854         RExC_parse++;
12855         vFAIL("Nested quantifiers");
12856     }
12857
12858     return(ret);
12859
12860   min0_maxINF_wildcard_forbidden:
12861
12862     /* Here we are in a wildcard match, and the minimum match length is 0, and
12863      * the max could be infinity.  This is currently forbidden.  The only
12864      * reason is to make it harder to write patterns that take a long long time
12865      * to halt, and because the use of this construct isn't necessary in
12866      * matching Unicode property values */
12867     RExC_parse++;
12868     /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
12869        subpatterns in regex; marked by <-- HERE in m/%s/
12870      */
12871     vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
12872           " subpatterns");
12873
12874     /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
12875      * legal at all in wildcards, so can't get this far */
12876
12877     NOT_REACHED; /*NOTREACHED*/
12878 }
12879
12880 STATIC bool
12881 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12882                 regnode_offset * node_p,
12883                 UV * code_point_p,
12884                 int * cp_count,
12885                 I32 * flagp,
12886                 const bool strict,
12887                 const U32 depth
12888     )
12889 {
12890  /* This routine teases apart the various meanings of \N and returns
12891   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12892   * in the current context.
12893   *
12894   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12895   *
12896   * If <code_point_p> is not NULL, the context is expecting the result to be a
12897   * single code point.  If this \N instance turns out to a single code point,
12898   * the function returns TRUE and sets *code_point_p to that code point.
12899   *
12900   * If <node_p> is not NULL, the context is expecting the result to be one of
12901   * the things representable by a regnode.  If this \N instance turns out to be
12902   * one such, the function generates the regnode, returns TRUE and sets *node_p
12903   * to point to the offset of that regnode into the regex engine program being
12904   * compiled.
12905   *
12906   * If this instance of \N isn't legal in any context, this function will
12907   * generate a fatal error and not return.
12908   *
12909   * On input, RExC_parse should point to the first char following the \N at the
12910   * time of the call.  On successful return, RExC_parse will have been updated
12911   * to point to just after the sequence identified by this routine.  Also
12912   * *flagp has been updated as needed.
12913   *
12914   * When there is some problem with the current context and this \N instance,
12915   * the function returns FALSE, without advancing RExC_parse, nor setting
12916   * *node_p, nor *code_point_p, nor *flagp.
12917   *
12918   * If <cp_count> is not NULL, the caller wants to know the length (in code
12919   * points) that this \N sequence matches.  This is set, and the input is
12920   * parsed for errors, even if the function returns FALSE, as detailed below.
12921   *
12922   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12923   *
12924   * Probably the most common case is for the \N to specify a single code point.
12925   * *cp_count will be set to 1, and *code_point_p will be set to that code
12926   * point.
12927   *
12928   * Another possibility is for the input to be an empty \N{}.  This is no
12929   * longer accepted, and will generate a fatal error.
12930   *
12931   * Another possibility is for a custom charnames handler to be in effect which
12932   * translates the input name to an empty string.  *cp_count will be set to 0.
12933   * *node_p will be set to a generated NOTHING node.
12934   *
12935   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12936   * set to 0. *node_p will be set to a generated REG_ANY node.
12937   *
12938   * The fifth possibility is that \N resolves to a sequence of more than one
12939   * code points.  *cp_count will be set to the number of code points in the
12940   * sequence. *node_p will be set to a generated node returned by this
12941   * function calling S_reg().
12942   *
12943   * The final possibility is that it is premature to be calling this function;
12944   * the parse needs to be restarted.  This can happen when this changes from
12945   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12946   * latter occurs only when the fifth possibility would otherwise be in
12947   * effect, and is because one of those code points requires the pattern to be
12948   * recompiled as UTF-8.  The function returns FALSE, and sets the
12949   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12950   * happens, the caller needs to desist from continuing parsing, and return
12951   * this information to its caller.  This is not set for when there is only one
12952   * code point, as this can be called as part of an ANYOF node, and they can
12953   * store above-Latin1 code points without the pattern having to be in UTF-8.
12954   *
12955   * For non-single-quoted regexes, the tokenizer has resolved character and
12956   * sequence names inside \N{...} into their Unicode values, normalizing the
12957   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12958   * hex-represented code points in the sequence.  This is done there because
12959   * the names can vary based on what charnames pragma is in scope at the time,
12960   * so we need a way to take a snapshot of what they resolve to at the time of
12961   * the original parse. [perl #56444].
12962   *
12963   * That parsing is skipped for single-quoted regexes, so here we may get
12964   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12965   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12966   * the native character set for non-ASCII platforms.  The other possibilities
12967   * are already native, so no translation is done. */
12968
12969     char * endbrace;    /* points to '}' following the name */
12970     char* p = RExC_parse; /* Temporary */
12971
12972     SV * substitute_parse = NULL;
12973     char *orig_end;
12974     char *save_start;
12975     I32 flags;
12976
12977     DECLARE_AND_GET_RE_DEBUG_FLAGS;
12978
12979     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12980
12981     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12982     assert(! (node_p && cp_count));               /* At most 1 should be set */
12983
12984     if (cp_count) {     /* Initialize return for the most common case */
12985         *cp_count = 1;
12986     }
12987
12988     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12989      * modifier.  The other meanings do not, so use a temporary until we find
12990      * out which we are being called with */
12991     skip_to_be_ignored_text(pRExC_state, &p,
12992                             FALSE /* Don't force to /x */ );
12993
12994     /* Disambiguate between \N meaning a named character versus \N meaning
12995      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12996      * quantifier, or if there is no '{' at all */
12997     if (*p != '{' || regcurly(p)) {
12998         RExC_parse = p;
12999         if (cp_count) {
13000             *cp_count = -1;
13001         }
13002
13003         if (! node_p) {
13004             return FALSE;
13005         }
13006
13007         *node_p = reg_node(pRExC_state, REG_ANY);
13008         *flagp |= HASWIDTH|SIMPLE;
13009         MARK_NAUGHTY(1);
13010         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
13011         return TRUE;
13012     }
13013
13014     /* The test above made sure that the next real character is a '{', but
13015      * under the /x modifier, it could be separated by space (or a comment and
13016      * \n) and this is not allowed (for consistency with \x{...} and the
13017      * tokenizer handling of \N{NAME}). */
13018     if (*RExC_parse != '{') {
13019         vFAIL("Missing braces on \\N{}");
13020     }
13021
13022     RExC_parse++;       /* Skip past the '{' */
13023
13024     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13025     if (! endbrace) { /* no trailing brace */
13026         vFAIL2("Missing right brace on \\%c{}", 'N');
13027     }
13028
13029     /* Here, we have decided it should be a named character or sequence.  These
13030      * imply Unicode semantics */
13031     REQUIRE_UNI_RULES(flagp, FALSE);
13032
13033     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13034      * nothing at all (not allowed under strict) */
13035     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13036         RExC_parse = endbrace;
13037         if (strict) {
13038             RExC_parse++;   /* Position after the "}" */
13039             vFAIL("Zero length \\N{}");
13040         }
13041
13042         if (cp_count) {
13043             *cp_count = 0;
13044         }
13045         nextchar(pRExC_state);
13046         if (! node_p) {
13047             return FALSE;
13048         }
13049
13050         *node_p = reg_node(pRExC_state, NOTHING);
13051         return TRUE;
13052     }
13053
13054     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13055
13056         /* Here, the name isn't of the form  U+....  This can happen if the
13057          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
13058          * is the time to find out what the name means */
13059
13060         const STRLEN name_len = endbrace - RExC_parse;
13061         SV *  value_sv;     /* What does this name evaluate to */
13062         SV ** value_svp;
13063         const U8 * value;   /* string of name's value */
13064         STRLEN value_len;   /* and its length */
13065
13066         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
13067          *  toke.c, and their values. Make sure is initialized */
13068         if (! RExC_unlexed_names) {
13069             RExC_unlexed_names = newHV();
13070         }
13071
13072         /* If we have already seen this name in this pattern, use that.  This
13073          * allows us to only call the charnames handler once per name per
13074          * pattern.  A broken or malicious handler could return something
13075          * different each time, which could cause the results to vary depending
13076          * on if something gets added or subtracted from the pattern that
13077          * causes the number of passes to change, for example */
13078         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13079                                                       name_len, 0)))
13080         {
13081             value_sv = *value_svp;
13082         }
13083         else { /* Otherwise we have to go out and get the name */
13084             const char * error_msg = NULL;
13085             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13086                                                       UTF,
13087                                                       &error_msg);
13088             if (error_msg) {
13089                 RExC_parse = endbrace;
13090                 vFAIL(error_msg);
13091             }
13092
13093             /* If no error message, should have gotten a valid return */
13094             assert (value_sv);
13095
13096             /* Save the name's meaning for later use */
13097             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13098                            value_sv, 0))
13099             {
13100                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13101             }
13102         }
13103
13104         /* Here, we have the value the name evaluates to in 'value_sv' */
13105         value = (U8 *) SvPV(value_sv, value_len);
13106
13107         /* See if the result is one code point vs 0 or multiple */
13108         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13109                                   ? UTF8SKIP(value)
13110                                   : 1)))
13111         {
13112             /* Here, exactly one code point.  If that isn't what is wanted,
13113              * fail */
13114             if (! code_point_p) {
13115                 RExC_parse = p;
13116                 return FALSE;
13117             }
13118
13119             /* Convert from string to numeric code point */
13120             *code_point_p = (SvUTF8(value_sv))
13121                             ? valid_utf8_to_uvchr(value, NULL)
13122                             : *value;
13123
13124             /* Have parsed this entire single code point \N{...}.  *cp_count
13125              * has already been set to 1, so don't do it again. */
13126             RExC_parse = endbrace;
13127             nextchar(pRExC_state);
13128             return TRUE;
13129         } /* End of is a single code point */
13130
13131         /* Count the code points, if caller desires.  The API says to do this
13132          * even if we will later return FALSE */
13133         if (cp_count) {
13134             *cp_count = 0;
13135
13136             *cp_count = (SvUTF8(value_sv))
13137                         ? utf8_length(value, value + value_len)
13138                         : value_len;
13139         }
13140
13141         /* Fail if caller doesn't want to handle a multi-code-point sequence.
13142          * But don't back the pointer up if the caller wants to know how many
13143          * code points there are (they need to handle it themselves in this
13144          * case).  */
13145         if (! node_p) {
13146             if (! cp_count) {
13147                 RExC_parse = p;
13148             }
13149             return FALSE;
13150         }
13151
13152         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13153          * reg recursively to parse it.  That way, it retains its atomicness,
13154          * while not having to worry about any special handling that some code
13155          * points may have. */
13156
13157         substitute_parse = newSVpvs("?:");
13158         sv_catsv(substitute_parse, value_sv);
13159         sv_catpv(substitute_parse, ")");
13160
13161         /* The value should already be native, so no need to convert on EBCDIC
13162          * platforms.*/
13163         assert(! RExC_recode_x_to_native);
13164
13165     }
13166     else {   /* \N{U+...} */
13167         Size_t count = 0;   /* code point count kept internally */
13168
13169         /* We can get to here when the input is \N{U+...} or when toke.c has
13170          * converted a name to the \N{U+...} form.  This include changing a
13171          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13172
13173         RExC_parse += 2;    /* Skip past the 'U+' */
13174
13175         /* Code points are separated by dots.  The '}' terminates the whole
13176          * thing. */
13177
13178         do {    /* Loop until the ending brace */
13179             I32 flags = PERL_SCAN_SILENT_OVERFLOW
13180                       | PERL_SCAN_SILENT_ILLDIGIT
13181                       | PERL_SCAN_NOTIFY_ILLDIGIT
13182                       | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13183                       | PERL_SCAN_DISALLOW_PREFIX;
13184             STRLEN len = endbrace - RExC_parse;
13185             NV overflow_value;
13186             char * start_digit = RExC_parse;
13187             UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13188
13189             if (len == 0) {
13190                 RExC_parse++;
13191               bad_NU:
13192                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13193             }
13194
13195             RExC_parse += len;
13196
13197             if (cp > MAX_LEGAL_CP) {
13198                 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13199             }
13200
13201             if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13202                 if (count) {
13203                     goto do_concat;
13204                 }
13205
13206                 /* Here, is a single code point; fail if doesn't want that */
13207                 if (! code_point_p) {
13208                     RExC_parse = p;
13209                     return FALSE;
13210                 }
13211
13212                 /* A single code point is easy to handle; just return it */
13213                 *code_point_p = UNI_TO_NATIVE(cp);
13214                 RExC_parse = endbrace;
13215                 nextchar(pRExC_state);
13216                 return TRUE;
13217             }
13218
13219             /* Here, the parse stopped bfore the ending brace.  This is legal
13220              * only if that character is a dot separating code points, like a
13221              * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13222              * So the next character must be a dot (and the one after that
13223              * can't be the endbrace, or we'd have something like \N{U+100.} )
13224              * */
13225             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13226                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13227                               ? UTF8SKIP(RExC_parse)
13228                               : 1;
13229                 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13230                                                           malformed utf8 */
13231                 goto bad_NU;
13232             }
13233
13234             /* Here, looks like its really a multiple character sequence.  Fail
13235              * if that's not what the caller wants.  But continue with counting
13236              * and error checking if they still want a count */
13237             if (! node_p && ! cp_count) {
13238                 return FALSE;
13239             }
13240
13241             /* What is done here is to convert this to a sub-pattern of the
13242              * form \x{char1}\x{char2}...  and then call reg recursively to
13243              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13244              * atomicness, while not having to worry about special handling
13245              * that some code points may have.  We don't create a subpattern,
13246              * but go through the motions of code point counting and error
13247              * checking, if the caller doesn't want a node returned. */
13248
13249             if (node_p && ! substitute_parse) {
13250                 substitute_parse = newSVpvs("?:");
13251             }
13252
13253           do_concat:
13254
13255             if (node_p) {
13256                 /* Convert to notation the rest of the code understands */
13257                 sv_catpvs(substitute_parse, "\\x{");
13258                 sv_catpvn(substitute_parse, start_digit,
13259                                             RExC_parse - start_digit);
13260                 sv_catpvs(substitute_parse, "}");
13261             }
13262
13263             /* Move to after the dot (or ending brace the final time through.)
13264              * */
13265             RExC_parse++;
13266             count++;
13267
13268         } while (RExC_parse < endbrace);
13269
13270         if (! node_p) { /* Doesn't want the node */
13271             assert (cp_count);
13272
13273             *cp_count = count;
13274             return FALSE;
13275         }
13276
13277         sv_catpvs(substitute_parse, ")");
13278
13279         /* The values are Unicode, and therefore have to be converted to native
13280          * on a non-Unicode (meaning non-ASCII) platform. */
13281         SET_recode_x_to_native(1);
13282     }
13283
13284     /* Here, we have the string the name evaluates to, ready to be parsed,
13285      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13286      * constructs.  This can be called from within a substitute parse already.
13287      * The error reporting mechanism doesn't work for 2 levels of this, but the
13288      * code above has validated this new construct, so there should be no
13289      * errors generated by the below.  And this isn' an exact copy, so the
13290      * mechanism to seamlessly deal with this won't work, so turn off warnings
13291      * during it */
13292     save_start = RExC_start;
13293     orig_end = RExC_end;
13294
13295     RExC_parse = RExC_start = SvPVX(substitute_parse);
13296     RExC_end = RExC_parse + SvCUR(substitute_parse);
13297     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13298
13299     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13300
13301     /* Restore the saved values */
13302     RESTORE_WARNINGS;
13303     RExC_start = save_start;
13304     RExC_parse = endbrace;
13305     RExC_end = orig_end;
13306     SET_recode_x_to_native(0);
13307
13308     SvREFCNT_dec_NN(substitute_parse);
13309
13310     if (! *node_p) {
13311         RETURN_FAIL_ON_RESTART(flags, flagp);
13312         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13313             (UV) flags);
13314     }
13315     *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13316
13317     nextchar(pRExC_state);
13318
13319     return TRUE;
13320 }
13321
13322
13323 STATIC U8
13324 S_compute_EXACTish(RExC_state_t *pRExC_state)
13325 {
13326     U8 op;
13327
13328     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13329
13330     if (! FOLD) {
13331         return (LOC)
13332                 ? EXACTL
13333                 : EXACT;
13334     }
13335
13336     op = get_regex_charset(RExC_flags);
13337     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13338         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13339                  been, so there is no hole */
13340     }
13341
13342     return op + EXACTF;
13343 }
13344
13345 STATIC bool
13346 S_new_regcurly(const char *s, const char *e)
13347 {
13348     /* This is a temporary function designed to match the most lenient form of
13349      * a {m,n} quantifier we ever envision, with either number omitted, and
13350      * spaces anywhere between/before/after them.
13351      *
13352      * If this function fails, then the string it matches is very unlikely to
13353      * ever be considered a valid quantifier, so we can allow the '{' that
13354      * begins it to be considered as a literal */
13355
13356     bool has_min = FALSE;
13357     bool has_max = FALSE;
13358
13359     PERL_ARGS_ASSERT_NEW_REGCURLY;
13360
13361     if (s >= e || *s++ != '{')
13362         return FALSE;
13363
13364     while (s < e && isSPACE(*s)) {
13365         s++;
13366     }
13367     while (s < e && isDIGIT(*s)) {
13368         has_min = TRUE;
13369         s++;
13370     }
13371     while (s < e && isSPACE(*s)) {
13372         s++;
13373     }
13374
13375     if (*s == ',') {
13376         s++;
13377         while (s < e && isSPACE(*s)) {
13378             s++;
13379         }
13380         while (s < e && isDIGIT(*s)) {
13381             has_max = TRUE;
13382             s++;
13383         }
13384         while (s < e && isSPACE(*s)) {
13385             s++;
13386         }
13387     }
13388
13389     return s < e && *s == '}' && (has_min || has_max);
13390 }
13391
13392 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13393  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13394
13395 static I32
13396 S_backref_value(char *p, char *e)
13397 {
13398     const char* endptr = e;
13399     UV val;
13400     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13401         return (I32)val;
13402     return I32_MAX;
13403 }
13404
13405
13406 /*
13407  - regatom - the lowest level
13408
13409    Try to identify anything special at the start of the current parse position.
13410    If there is, then handle it as required. This may involve generating a
13411    single regop, such as for an assertion; or it may involve recursing, such as
13412    to handle a () structure.
13413
13414    If the string doesn't start with something special then we gobble up
13415    as much literal text as we can.  If we encounter a quantifier, we have to
13416    back off the final literal character, as that quantifier applies to just it
13417    and not to the whole string of literals.
13418
13419    Once we have been able to handle whatever type of thing started the
13420    sequence, we return the offset into the regex engine program being compiled
13421    at which any  next regnode should be placed.
13422
13423    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13424    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13425    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13426    Otherwise does not return 0.
13427
13428    Note: we have to be careful with escapes, as they can be both literal
13429    and special, and in the case of \10 and friends, context determines which.
13430
13431    A summary of the code structure is:
13432
13433    switch (first_byte) {
13434         cases for each special:
13435             handle this special;
13436             break;
13437         case '\\':
13438             switch (2nd byte) {
13439                 cases for each unambiguous special:
13440                     handle this special;
13441                     break;
13442                 cases for each ambigous special/literal:
13443                     disambiguate;
13444                     if (special)  handle here
13445                     else goto defchar;
13446                 default: // unambiguously literal:
13447                     goto defchar;
13448             }
13449         default:  // is a literal char
13450             // FALL THROUGH
13451         defchar:
13452             create EXACTish node for literal;
13453             while (more input and node isn't full) {
13454                 switch (input_byte) {
13455                    cases for each special;
13456                        make sure parse pointer is set so that the next call to
13457                            regatom will see this special first
13458                        goto loopdone; // EXACTish node terminated by prev. char
13459                    default:
13460                        append char to EXACTISH node;
13461                 }
13462                 get next input byte;
13463             }
13464         loopdone:
13465    }
13466    return the generated node;
13467
13468    Specifically there are two separate switches for handling
13469    escape sequences, with the one for handling literal escapes requiring
13470    a dummy entry for all of the special escapes that are actually handled
13471    by the other.
13472
13473 */
13474
13475 STATIC regnode_offset
13476 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13477 {
13478     regnode_offset ret = 0;
13479     I32 flags = 0;
13480     char *parse_start;
13481     U8 op;
13482     int invert = 0;
13483
13484     DECLARE_AND_GET_RE_DEBUG_FLAGS;
13485
13486     *flagp = 0;         /* Initialize. */
13487
13488     DEBUG_PARSE("atom");
13489
13490     PERL_ARGS_ASSERT_REGATOM;
13491
13492   tryagain:
13493     parse_start = RExC_parse;
13494     assert(RExC_parse < RExC_end);
13495     switch ((U8)*RExC_parse) {
13496     case '^':
13497         RExC_seen_zerolen++;
13498         nextchar(pRExC_state);
13499         if (RExC_flags & RXf_PMf_MULTILINE)
13500             ret = reg_node(pRExC_state, MBOL);
13501         else
13502             ret = reg_node(pRExC_state, SBOL);
13503         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13504         break;
13505     case '$':
13506         nextchar(pRExC_state);
13507         if (*RExC_parse)
13508             RExC_seen_zerolen++;
13509         if (RExC_flags & RXf_PMf_MULTILINE)
13510             ret = reg_node(pRExC_state, MEOL);
13511         else
13512             ret = reg_node(pRExC_state, SEOL);
13513         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13514         break;
13515     case '.':
13516         nextchar(pRExC_state);
13517         if (RExC_flags & RXf_PMf_SINGLELINE)
13518             ret = reg_node(pRExC_state, SANY);
13519         else
13520             ret = reg_node(pRExC_state, REG_ANY);
13521         *flagp |= HASWIDTH|SIMPLE;
13522         MARK_NAUGHTY(1);
13523         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13524         break;
13525     case '[':
13526     {
13527         char * const oregcomp_parse = ++RExC_parse;
13528         ret = regclass(pRExC_state, flagp, depth+1,
13529                        FALSE, /* means parse the whole char class */
13530                        TRUE, /* allow multi-char folds */
13531                        FALSE, /* don't silence non-portable warnings. */
13532                        (bool) RExC_strict,
13533                        TRUE, /* Allow an optimized regnode result */
13534                        NULL);
13535         if (ret == 0) {
13536             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13537             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13538                   (UV) *flagp);
13539         }
13540         if (*RExC_parse != ']') {
13541             RExC_parse = oregcomp_parse;
13542             vFAIL("Unmatched [");
13543         }
13544         nextchar(pRExC_state);
13545         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13546         break;
13547     }
13548     case '(':
13549         nextchar(pRExC_state);
13550         ret = reg(pRExC_state, 2, &flags, depth+1);
13551         if (ret == 0) {
13552                 if (flags & TRYAGAIN) {
13553                     if (RExC_parse >= RExC_end) {
13554                          /* Make parent create an empty node if needed. */
13555                         *flagp |= TRYAGAIN;
13556                         return(0);
13557                     }
13558                     goto tryagain;
13559                 }
13560                 RETURN_FAIL_ON_RESTART(flags, flagp);
13561                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13562                                                                  (UV) flags);
13563         }
13564         *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13565         break;
13566     case '|':
13567     case ')':
13568         if (flags & TRYAGAIN) {
13569             *flagp |= TRYAGAIN;
13570             return 0;
13571         }
13572         vFAIL("Internal urp");
13573                                 /* Supposed to be caught earlier. */
13574         break;
13575     case '?':
13576     case '+':
13577     case '*':
13578         RExC_parse++;
13579         vFAIL("Quantifier follows nothing");
13580         break;
13581     case '\\':
13582         /* Special Escapes
13583
13584            This switch handles escape sequences that resolve to some kind
13585            of special regop and not to literal text. Escape sequences that
13586            resolve to literal text are handled below in the switch marked
13587            "Literal Escapes".
13588
13589            Every entry in this switch *must* have a corresponding entry
13590            in the literal escape switch. However, the opposite is not
13591            required, as the default for this switch is to jump to the
13592            literal text handling code.
13593         */
13594         RExC_parse++;
13595         switch ((U8)*RExC_parse) {
13596         /* Special Escapes */
13597         case 'A':
13598             RExC_seen_zerolen++;
13599             /* Under wildcards, this is changed to match \n; should be
13600              * invisible to the user, as they have to compile under /m */
13601             if (RExC_pm_flags & PMf_WILDCARD) {
13602                 ret = reg_node(pRExC_state, MBOL);
13603             }
13604             else {
13605                 ret = reg_node(pRExC_state, SBOL);
13606                 /* SBOL is shared with /^/ so we set the flags so we can tell
13607                  * /\A/ from /^/ in split. */
13608                 FLAGS(REGNODE_p(ret)) = 1;
13609             }
13610             goto finish_meta_pat;
13611         case 'G':
13612             if (RExC_pm_flags & PMf_WILDCARD) {
13613                 RExC_parse++;
13614                 /* diag_listed_as: Use of %s is not allowed in Unicode property
13615                    wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13616                  */
13617                 vFAIL("Use of '\\G' is not allowed in Unicode property"
13618                       " wildcard subpatterns");
13619             }
13620             ret = reg_node(pRExC_state, GPOS);
13621             RExC_seen |= REG_GPOS_SEEN;
13622             goto finish_meta_pat;
13623         case 'K':
13624             if (!RExC_in_lookaround) {
13625                 RExC_seen_zerolen++;
13626                 ret = reg_node(pRExC_state, KEEPS);
13627                 /* XXX:dmq : disabling in-place substitution seems to
13628                  * be necessary here to avoid cases of memory corruption, as
13629                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13630                  */
13631                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13632                 goto finish_meta_pat;
13633             }
13634             else {
13635                 ++RExC_parse; /* advance past the 'K' */
13636                 vFAIL("\\K not permitted in lookahead/lookbehind");
13637             }
13638         case 'Z':
13639             if (RExC_pm_flags & PMf_WILDCARD) {
13640                 /* See comment under \A above */
13641                 ret = reg_node(pRExC_state, MEOL);
13642             }
13643             else {
13644                 ret = reg_node(pRExC_state, SEOL);
13645             }
13646             RExC_seen_zerolen++;                /* Do not optimize RE away */
13647             goto finish_meta_pat;
13648         case 'z':
13649             if (RExC_pm_flags & PMf_WILDCARD) {
13650                 /* See comment under \A above */
13651                 ret = reg_node(pRExC_state, MEOL);
13652             }
13653             else {
13654                 ret = reg_node(pRExC_state, EOS);
13655             }
13656             RExC_seen_zerolen++;                /* Do not optimize RE away */
13657             goto finish_meta_pat;
13658         case 'C':
13659             vFAIL("\\C no longer supported");
13660         case 'X':
13661             ret = reg_node(pRExC_state, CLUMP);
13662             *flagp |= HASWIDTH;
13663             goto finish_meta_pat;
13664
13665         case 'B':
13666             invert = 1;
13667             /* FALLTHROUGH */
13668         case 'b':
13669           {
13670             U8 flags = 0;
13671             regex_charset charset = get_regex_charset(RExC_flags);
13672
13673             RExC_seen_zerolen++;
13674             RExC_seen |= REG_LOOKBEHIND_SEEN;
13675             op = BOUND + charset;
13676
13677             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13678                 flags = TRADITIONAL_BOUND;
13679                 if (op > BOUNDA) {  /* /aa is same as /a */
13680                     op = BOUNDA;
13681                 }
13682             }
13683             else {
13684                 STRLEN length;
13685                 char name = *RExC_parse;
13686                 char * endbrace = NULL;
13687                 RExC_parse += 2;
13688                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13689
13690                 if (! endbrace) {
13691                     vFAIL2("Missing right brace on \\%c{}", name);
13692                 }
13693                 /* XXX Need to decide whether to take spaces or not.  Should be
13694                  * consistent with \p{}, but that currently is SPACE, which
13695                  * means vertical too, which seems wrong
13696                  * while (isBLANK(*RExC_parse)) {
13697                     RExC_parse++;
13698                 }*/
13699                 if (endbrace == RExC_parse) {
13700                     RExC_parse++;  /* After the '}' */
13701                     vFAIL2("Empty \\%c{}", name);
13702                 }
13703                 length = endbrace - RExC_parse;
13704                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13705                     length--;
13706                 }*/
13707                 switch (*RExC_parse) {
13708                     case 'g':
13709                         if (    length != 1
13710                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13711                         {
13712                             goto bad_bound_type;
13713                         }
13714                         flags = GCB_BOUND;
13715                         break;
13716                     case 'l':
13717                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13718                             goto bad_bound_type;
13719                         }
13720                         flags = LB_BOUND;
13721                         break;
13722                     case 's':
13723                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13724                             goto bad_bound_type;
13725                         }
13726                         flags = SB_BOUND;
13727                         break;
13728                     case 'w':
13729                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13730                             goto bad_bound_type;
13731                         }
13732                         flags = WB_BOUND;
13733                         break;
13734                     default:
13735                       bad_bound_type:
13736                         RExC_parse = endbrace;
13737                         vFAIL2utf8f(
13738                             "'%" UTF8f "' is an unknown bound type",
13739                             UTF8fARG(UTF, length, endbrace - length));
13740                         NOT_REACHED; /*NOTREACHED*/
13741                 }
13742                 RExC_parse = endbrace;
13743                 REQUIRE_UNI_RULES(flagp, 0);
13744
13745                 if (op == BOUND) {
13746                     op = BOUNDU;
13747                 }
13748                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13749                     op = BOUNDU;
13750                     length += 4;
13751
13752                     /* Don't have to worry about UTF-8, in this message because
13753                      * to get here the contents of the \b must be ASCII */
13754                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13755                               "Using /u for '%.*s' instead of /%s",
13756                               (unsigned) length,
13757                               endbrace - length + 1,
13758                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13759                               ? ASCII_RESTRICT_PAT_MODS
13760                               : ASCII_MORE_RESTRICT_PAT_MODS);
13761                 }
13762             }
13763
13764             if (op == BOUND) {
13765                 RExC_seen_d_op = TRUE;
13766             }
13767             else if (op == BOUNDL) {
13768                 RExC_contains_locale = 1;
13769             }
13770
13771             if (invert) {
13772                 op += NBOUND - BOUND;
13773             }
13774
13775             ret = reg_node(pRExC_state, op);
13776             FLAGS(REGNODE_p(ret)) = flags;
13777
13778             goto finish_meta_pat;
13779           }
13780
13781         case 'R':
13782             ret = reg_node(pRExC_state, LNBREAK);
13783             *flagp |= HASWIDTH|SIMPLE;
13784             goto finish_meta_pat;
13785
13786         case 'd':
13787         case 'D':
13788         case 'h':
13789         case 'H':
13790         case 'p':
13791         case 'P':
13792         case 's':
13793         case 'S':
13794         case 'v':
13795         case 'V':
13796         case 'w':
13797         case 'W':
13798             /* These all have the same meaning inside [brackets], and it knows
13799              * how to do the best optimizations for them.  So, pretend we found
13800              * these within brackets, and let it do the work */
13801             RExC_parse--;
13802
13803             ret = regclass(pRExC_state, flagp, depth+1,
13804                            TRUE, /* means just parse this element */
13805                            FALSE, /* don't allow multi-char folds */
13806                            FALSE, /* don't silence non-portable warnings.  It
13807                                      would be a bug if these returned
13808                                      non-portables */
13809                            (bool) RExC_strict,
13810                            TRUE, /* Allow an optimized regnode result */
13811                            NULL);
13812             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13813             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13814              * multi-char folds are allowed.  */
13815             if (!ret)
13816                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13817                       (UV) *flagp);
13818
13819             RExC_parse--;   /* regclass() leaves this one too far ahead */
13820
13821           finish_meta_pat:
13822                    /* The escapes above that don't take a parameter can't be
13823                     * followed by a '{'.  But 'pX', 'p{foo}' and
13824                     * correspondingly 'P' can be */
13825             if (   RExC_parse - parse_start == 1
13826                 && UCHARAT(RExC_parse + 1) == '{'
13827                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13828             {
13829                 RExC_parse += 2;
13830                 vFAIL("Unescaped left brace in regex is illegal here");
13831             }
13832             Set_Node_Offset(REGNODE_p(ret), parse_start);
13833             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13834             nextchar(pRExC_state);
13835             break;
13836         case 'N':
13837             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13838              * \N{...} evaluates to a sequence of more than one code points).
13839              * The function call below returns a regnode, which is our result.
13840              * The parameters cause it to fail if the \N{} evaluates to a
13841              * single code point; we handle those like any other literal.  The
13842              * reason that the multicharacter case is handled here and not as
13843              * part of the EXACtish code is because of quantifiers.  In
13844              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13845              * this way makes that Just Happen. dmq.
13846              * join_exact() will join this up with adjacent EXACTish nodes
13847              * later on, if appropriate. */
13848             ++RExC_parse;
13849             if (grok_bslash_N(pRExC_state,
13850                               &ret,     /* Want a regnode returned */
13851                               NULL,     /* Fail if evaluates to a single code
13852                                            point */
13853                               NULL,     /* Don't need a count of how many code
13854                                            points */
13855                               flagp,
13856                               RExC_strict,
13857                               depth)
13858             ) {
13859                 break;
13860             }
13861
13862             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13863
13864             /* Here, evaluates to a single code point.  Go get that */
13865             RExC_parse = parse_start;
13866             goto defchar;
13867
13868         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13869       parse_named_seq:
13870         {
13871             char ch;
13872             if (   RExC_parse >= RExC_end - 1
13873                 || ((   ch = RExC_parse[1]) != '<'
13874                                       && ch != '\''
13875                                       && ch != '{'))
13876             {
13877                 RExC_parse++;
13878                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13879                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13880             } else {
13881                 RExC_parse += 2;
13882                 ret = handle_named_backref(pRExC_state,
13883                                            flagp,
13884                                            parse_start,
13885                                            (ch == '<')
13886                                            ? '>'
13887                                            : (ch == '{')
13888                                              ? '}'
13889                                              : '\'');
13890             }
13891             break;
13892         }
13893         case 'g':
13894         case '1': case '2': case '3': case '4':
13895         case '5': case '6': case '7': case '8': case '9':
13896             {
13897                 I32 num;
13898                 bool hasbrace = 0;
13899
13900                 if (*RExC_parse == 'g') {
13901                     bool isrel = 0;
13902
13903                     RExC_parse++;
13904                     if (*RExC_parse == '{') {
13905                         RExC_parse++;
13906                         hasbrace = 1;
13907                     }
13908                     if (*RExC_parse == '-') {
13909                         RExC_parse++;
13910                         isrel = 1;
13911                     }
13912                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13913                         if (isrel) RExC_parse--;
13914                         RExC_parse -= 2;
13915                         goto parse_named_seq;
13916                     }
13917
13918                     if (RExC_parse >= RExC_end) {
13919                         goto unterminated_g;
13920                     }
13921                     num = S_backref_value(RExC_parse, RExC_end);
13922                     if (num == 0)
13923                         vFAIL("Reference to invalid group 0");
13924                     else if (num == I32_MAX) {
13925                          if (isDIGIT(*RExC_parse))
13926                             vFAIL("Reference to nonexistent group");
13927                         else
13928                           unterminated_g:
13929                             vFAIL("Unterminated \\g... pattern");
13930                     }
13931
13932                     if (isrel) {
13933                         num = RExC_npar - num;
13934                         if (num < 1)
13935                             vFAIL("Reference to nonexistent or unclosed group");
13936                     }
13937                 }
13938                 else {
13939                     num = S_backref_value(RExC_parse, RExC_end);
13940                     /* bare \NNN might be backref or octal - if it is larger
13941                      * than or equal RExC_npar then it is assumed to be an
13942                      * octal escape. Note RExC_npar is +1 from the actual
13943                      * number of parens. */
13944                     /* Note we do NOT check if num == I32_MAX here, as that is
13945                      * handled by the RExC_npar check */
13946
13947                     if (
13948                         /* any numeric escape < 10 is always a backref */
13949                         num > 9
13950                         /* any numeric escape < RExC_npar is a backref */
13951                         && num >= RExC_npar
13952                         /* cannot be an octal escape if it starts with [89] */
13953                         && ! inRANGE(*RExC_parse, '8', '9')
13954                     ) {
13955                         /* Probably not meant to be a backref, instead likely
13956                          * to be an octal character escape, e.g. \35 or \777.
13957                          * The above logic should make it obvious why using
13958                          * octal escapes in patterns is problematic. - Yves */
13959                         RExC_parse = parse_start;
13960                         goto defchar;
13961                     }
13962                 }
13963
13964                 /* At this point RExC_parse points at a numeric escape like
13965                  * \12 or \88 or something similar, which we should NOT treat
13966                  * as an octal escape. It may or may not be a valid backref
13967                  * escape. For instance \88888888 is unlikely to be a valid
13968                  * backref. */
13969                 while (isDIGIT(*RExC_parse))
13970                     RExC_parse++;
13971                 if (hasbrace) {
13972                     if (*RExC_parse != '}')
13973                         vFAIL("Unterminated \\g{...} pattern");
13974                     RExC_parse++;
13975                 }
13976                 if (num >= (I32)RExC_npar) {
13977
13978                     /* It might be a forward reference; we can't fail until we
13979                      * know, by completing the parse to get all the groups, and
13980                      * then reparsing */
13981                     if (ALL_PARENS_COUNTED)  {
13982                         if (num >= RExC_total_parens)  {
13983                             vFAIL("Reference to nonexistent group");
13984                         }
13985                     }
13986                     else {
13987                         REQUIRE_PARENS_PASS;
13988                     }
13989                 }
13990                 RExC_sawback = 1;
13991                 ret = reganode(pRExC_state,
13992                                ((! FOLD)
13993                                  ? REF
13994                                  : (ASCII_FOLD_RESTRICTED)
13995                                    ? REFFA
13996                                    : (AT_LEAST_UNI_SEMANTICS)
13997                                      ? REFFU
13998                                      : (LOC)
13999                                        ? REFFL
14000                                        : REFF),
14001                                 num);
14002                 if (OP(REGNODE_p(ret)) == REFF) {
14003                     RExC_seen_d_op = TRUE;
14004                 }
14005                 *flagp |= HASWIDTH;
14006
14007                 /* override incorrect value set in reganode MJD */
14008                 Set_Node_Offset(REGNODE_p(ret), parse_start);
14009                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
14010                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14011                                         FALSE /* Don't force to /x */ );
14012             }
14013             break;
14014         case '\0':
14015             if (RExC_parse >= RExC_end)
14016                 FAIL("Trailing \\");
14017             /* FALLTHROUGH */
14018         default:
14019             /* Do not generate "unrecognized" warnings here, we fall
14020                back into the quick-grab loop below */
14021             RExC_parse = parse_start;
14022             goto defchar;
14023         } /* end of switch on a \foo sequence */
14024         break;
14025
14026     case '#':
14027
14028         /* '#' comments should have been spaced over before this function was
14029          * called */
14030         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14031         /*
14032         if (RExC_flags & RXf_PMf_EXTENDED) {
14033             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14034             if (RExC_parse < RExC_end)
14035                 goto tryagain;
14036         }
14037         */
14038
14039         /* FALLTHROUGH */
14040
14041     default:
14042           defchar: {
14043
14044             /* Here, we have determined that the next thing is probably a
14045              * literal character.  RExC_parse points to the first byte of its
14046              * definition.  (It still may be an escape sequence that evaluates
14047              * to a single character) */
14048
14049             STRLEN len = 0;
14050             UV ender = 0;
14051             char *p;
14052             char *s, *old_s = NULL, *old_old_s = NULL;
14053             char *s0;
14054             U32 max_string_len = 255;
14055
14056             /* We may have to reparse the node, artificially stopping filling
14057              * it early, based on info gleaned in the first parse.  This
14058              * variable gives where we stop.  Make it above the normal stopping
14059              * place first time through; otherwise it would stop too early */
14060             U32 upper_fill = max_string_len + 1;
14061
14062             /* We start out as an EXACT node, even if under /i, until we find a
14063              * character which is in a fold.  The algorithm now segregates into
14064              * separate nodes, characters that fold from those that don't under
14065              * /i.  (This hopefully will create nodes that are fixed strings
14066              * even under /i, giving the optimizer something to grab on to.)
14067              * So, if a node has something in it and the next character is in
14068              * the opposite category, that node is closed up, and the function
14069              * returns.  Then regatom is called again, and a new node is
14070              * created for the new category. */
14071             U8 node_type = EXACT;
14072
14073             /* Assume the node will be fully used; the excess is given back at
14074              * the end.  Under /i, we may need to temporarily add the fold of
14075              * an extra character or two at the end to check for splitting
14076              * multi-char folds, so allocate extra space for that.   We can't
14077              * make any other length assumptions, as a byte input sequence
14078              * could shrink down. */
14079             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14080                                                  + ((! FOLD)
14081                                                     ? 0
14082                                                     : 2 * ((UTF)
14083                                                            ? UTF8_MAXBYTES_CASE
14084                         /* Max non-UTF-8 expansion is 2 */ : 2)));
14085
14086             bool next_is_quantifier;
14087             char * oldp = NULL;
14088
14089             /* We can convert EXACTF nodes to EXACTFU if they contain only
14090              * characters that match identically regardless of the target
14091              * string's UTF8ness.  The reason to do this is that EXACTF is not
14092              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14093              * runtime.
14094              *
14095              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14096              * contain only above-Latin1 characters (hence must be in UTF8),
14097              * which don't participate in folds with Latin1-range characters,
14098              * as the latter's folds aren't known until runtime. */
14099             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14100
14101             /* Single-character EXACTish nodes are almost always SIMPLE.  This
14102              * allows us to override this as encountered */
14103             U8 maybe_SIMPLE = SIMPLE;
14104
14105             /* Does this node contain something that can't match unless the
14106              * target string is (also) in UTF-8 */
14107             bool requires_utf8_target = FALSE;
14108
14109             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14110             bool has_ss = FALSE;
14111
14112             /* So is the MICRO SIGN */
14113             bool has_micro_sign = FALSE;
14114
14115             /* Set when we fill up the current node and there is still more
14116              * text to process */
14117             bool overflowed;
14118
14119             /* Allocate an EXACT node.  The node_type may change below to
14120              * another EXACTish node, but since the size of the node doesn't
14121              * change, it works */
14122             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14123                                                                     "exact");
14124             FILL_NODE(ret, node_type);
14125             RExC_emit++;
14126
14127             s = STRING(REGNODE_p(ret));
14128
14129             s0 = s;
14130
14131           reparse:
14132
14133             p = RExC_parse;
14134             len = 0;
14135             s = s0;
14136             node_type = EXACT;
14137             oldp = NULL;
14138             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14139             maybe_SIMPLE = SIMPLE;
14140             requires_utf8_target = FALSE;
14141             has_ss = FALSE;
14142             has_micro_sign = FALSE;
14143
14144           continue_parse:
14145
14146             /* This breaks under rare circumstances.  If folding, we do not
14147              * want to split a node at a character that is a non-final in a
14148              * multi-char fold, as an input string could just happen to want to
14149              * match across the node boundary.  The code at the end of the loop
14150              * looks for this, and backs off until it finds not such a
14151              * character, but it is possible (though extremely, extremely
14152              * unlikely) for all characters in the node to be non-final fold
14153              * ones, in which case we just leave the node fully filled, and
14154              * hope that it doesn't match the string in just the wrong place */
14155
14156             assert( ! UTF     /* Is at the beginning of a character */
14157                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14158                    || UTF8_IS_START(UCHARAT(RExC_parse)));
14159
14160             overflowed = FALSE;
14161
14162             /* Here, we have a literal character.  Find the maximal string of
14163              * them in the input that we can fit into a single EXACTish node.
14164              * We quit at the first non-literal or when the node gets full, or
14165              * under /i the categorization of folding/non-folding character
14166              * changes */
14167             while (p < RExC_end && len < upper_fill) {
14168
14169                 /* In most cases each iteration adds one byte to the output.
14170                  * The exceptions override this */
14171                 Size_t added_len = 1;
14172
14173                 oldp = p;
14174                 old_old_s = old_s;
14175                 old_s = s;
14176
14177                 /* White space has already been ignored */
14178                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
14179                        || ! is_PATWS_safe((p), RExC_end, UTF));
14180
14181                 switch ((U8)*p) {
14182                   const char* message;
14183                   U32 packed_warn;
14184                   U8 grok_c_char;
14185
14186                 case '^':
14187                 case '$':
14188                 case '.':
14189                 case '[':
14190                 case '(':
14191                 case ')':
14192                 case '|':
14193                     goto loopdone;
14194                 case '\\':
14195                     /* Literal Escapes Switch
14196
14197                        This switch is meant to handle escape sequences that
14198                        resolve to a literal character.
14199
14200                        Every escape sequence that represents something
14201                        else, like an assertion or a char class, is handled
14202                        in the switch marked 'Special Escapes' above in this
14203                        routine, but also has an entry here as anything that
14204                        isn't explicitly mentioned here will be treated as
14205                        an unescaped equivalent literal.
14206                     */
14207
14208                     switch ((U8)*++p) {
14209
14210                     /* These are all the special escapes. */
14211                     case 'A':             /* Start assertion */
14212                     case 'b': case 'B':   /* Word-boundary assertion*/
14213                     case 'C':             /* Single char !DANGEROUS! */
14214                     case 'd': case 'D':   /* digit class */
14215                     case 'g': case 'G':   /* generic-backref, pos assertion */
14216                     case 'h': case 'H':   /* HORIZWS */
14217                     case 'k': case 'K':   /* named backref, keep marker */
14218                     case 'p': case 'P':   /* Unicode property */
14219                               case 'R':   /* LNBREAK */
14220                     case 's': case 'S':   /* space class */
14221                     case 'v': case 'V':   /* VERTWS */
14222                     case 'w': case 'W':   /* word class */
14223                     case 'X':             /* eXtended Unicode "combining
14224                                              character sequence" */
14225                     case 'z': case 'Z':   /* End of line/string assertion */
14226                         --p;
14227                         goto loopdone;
14228
14229                     /* Anything after here is an escape that resolves to a
14230                        literal. (Except digits, which may or may not)
14231                      */
14232                     case 'n':
14233                         ender = '\n';
14234                         p++;
14235                         break;
14236                     case 'N': /* Handle a single-code point named character. */
14237                         RExC_parse = p + 1;
14238                         if (! grok_bslash_N(pRExC_state,
14239                                             NULL,   /* Fail if evaluates to
14240                                                        anything other than a
14241                                                        single code point */
14242                                             &ender, /* The returned single code
14243                                                        point */
14244                                             NULL,   /* Don't need a count of
14245                                                        how many code points */
14246                                             flagp,
14247                                             RExC_strict,
14248                                             depth)
14249                         ) {
14250                             if (*flagp & NEED_UTF8)
14251                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14252                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14253
14254                             /* Here, it wasn't a single code point.  Go close
14255                              * up this EXACTish node.  The switch() prior to
14256                              * this switch handles the other cases */
14257                             RExC_parse = p = oldp;
14258                             goto loopdone;
14259                         }
14260                         p = RExC_parse;
14261                         RExC_parse = parse_start;
14262
14263                         /* The \N{} means the pattern, if previously /d,
14264                          * becomes /u.  That means it can't be an EXACTF node,
14265                          * but an EXACTFU */
14266                         if (node_type == EXACTF) {
14267                             node_type = EXACTFU;
14268
14269                             /* If the node already contains something that
14270                              * differs between EXACTF and EXACTFU, reparse it
14271                              * as EXACTFU */
14272                             if (! maybe_exactfu) {
14273                                 len = 0;
14274                                 s = s0;
14275                                 goto reparse;
14276                             }
14277                         }
14278
14279                         break;
14280                     case 'r':
14281                         ender = '\r';
14282                         p++;
14283                         break;
14284                     case 't':
14285                         ender = '\t';
14286                         p++;
14287                         break;
14288                     case 'f':
14289                         ender = '\f';
14290                         p++;
14291                         break;
14292                     case 'e':
14293                         ender = ESC_NATIVE;
14294                         p++;
14295                         break;
14296                     case 'a':
14297                         ender = '\a';
14298                         p++;
14299                         break;
14300                     case 'o':
14301                         if (! grok_bslash_o(&p,
14302                                             RExC_end,
14303                                             &ender,
14304                                             &message,
14305                                             &packed_warn,
14306                                             (bool) RExC_strict,
14307                                             FALSE, /* No illegal cp's */
14308                                             UTF))
14309                         {
14310                             RExC_parse = p; /* going to die anyway; point to
14311                                                exact spot of failure */
14312                             vFAIL(message);
14313                         }
14314
14315                         if (message && TO_OUTPUT_WARNINGS(p)) {
14316                             warn_non_literal_string(p, packed_warn, message);
14317                         }
14318                         break;
14319                     case 'x':
14320                         if (! grok_bslash_x(&p,
14321                                             RExC_end,
14322                                             &ender,
14323                                             &message,
14324                                             &packed_warn,
14325                                             (bool) RExC_strict,
14326                                             FALSE, /* No illegal cp's */
14327                                             UTF))
14328                         {
14329                             RExC_parse = p;     /* going to die anyway; point
14330                                                    to exact spot of failure */
14331                             vFAIL(message);
14332                         }
14333
14334                         if (message && TO_OUTPUT_WARNINGS(p)) {
14335                             warn_non_literal_string(p, packed_warn, message);
14336                         }
14337
14338 #ifdef EBCDIC
14339                         if (ender < 0x100) {
14340                             if (RExC_recode_x_to_native) {
14341                                 ender = LATIN1_TO_NATIVE(ender);
14342                             }
14343                         }
14344 #endif
14345                         break;
14346                     case 'c':
14347                         p++;
14348                         if (! grok_bslash_c(*p, &grok_c_char,
14349                                             &message, &packed_warn))
14350                         {
14351                             /* going to die anyway; point to exact spot of
14352                              * failure */
14353                             RExC_parse = p + ((UTF)
14354                                               ? UTF8_SAFE_SKIP(p, RExC_end)
14355                                               : 1);
14356                             vFAIL(message);
14357                         }
14358
14359                         ender = grok_c_char;
14360                         p++;
14361                         if (message && TO_OUTPUT_WARNINGS(p)) {
14362                             warn_non_literal_string(p, packed_warn, message);
14363                         }
14364
14365                         break;
14366                     case '8': case '9': /* must be a backreference */
14367                         --p;
14368                         /* we have an escape like \8 which cannot be an octal escape
14369                          * so we exit the loop, and let the outer loop handle this
14370                          * escape which may or may not be a legitimate backref. */
14371                         goto loopdone;
14372                     case '1': case '2': case '3':case '4':
14373                     case '5': case '6': case '7':
14374                         /* When we parse backslash escapes there is ambiguity
14375                          * between backreferences and octal escapes. Any escape
14376                          * from \1 - \9 is a backreference, any multi-digit
14377                          * escape which does not start with 0 and which when
14378                          * evaluated as decimal could refer to an already
14379                          * parsed capture buffer is a back reference. Anything
14380                          * else is octal.
14381                          *
14382                          * Note this implies that \118 could be interpreted as
14383                          * 118 OR as "\11" . "8" depending on whether there
14384                          * were 118 capture buffers defined already in the
14385                          * pattern.  */
14386
14387                         /* NOTE, RExC_npar is 1 more than the actual number of
14388                          * parens we have seen so far, hence the "<" as opposed
14389                          * to "<=" */
14390                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14391                         {  /* Not to be treated as an octal constant, go
14392                                    find backref */
14393                             --p;
14394                             goto loopdone;
14395                         }
14396                         /* FALLTHROUGH */
14397                     case '0':
14398                         {
14399                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14400                                       | PERL_SCAN_NOTIFY_ILLDIGIT;
14401                             STRLEN numlen = 3;
14402                             ender = grok_oct(p, &numlen, &flags, NULL);
14403                             p += numlen;
14404                             if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14405                                 && isDIGIT(*p)  /* like \08, \178 */
14406                                 && ckWARN(WARN_REGEXP))
14407                             {
14408                                 reg_warn_non_literal_string(
14409                                      p + 1,
14410                                      form_alien_digit_msg(8, numlen, p,
14411                                                         RExC_end, UTF, FALSE));
14412                             }
14413                         }
14414                         break;
14415                     case '\0':
14416                         if (p >= RExC_end)
14417                             FAIL("Trailing \\");
14418                         /* FALLTHROUGH */
14419                     default:
14420                         if (isALPHANUMERIC(*p)) {
14421                             /* An alpha followed by '{' is going to fail next
14422                              * iteration, so don't output this warning in that
14423                              * case */
14424                             if (! isALPHA(*p) || *(p + 1) != '{') {
14425                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14426                                                   " passed through", p);
14427                             }
14428                         }
14429                         goto normal_default;
14430                     } /* End of switch on '\' */
14431                     break;
14432                 case '{':
14433                     /* Trying to gain new uses for '{' without breaking too
14434                      * much existing code is hard.  The solution currently
14435                      * adopted is:
14436                      *  1)  If there is no ambiguity that a '{' should always
14437                      *      be taken literally, at the start of a construct, we
14438                      *      just do so.
14439                      *  2)  If the literal '{' conflicts with our desired use
14440                      *      of it as a metacharacter, we die.  The deprecation
14441                      *      cycles for this have come and gone.
14442                      *  3)  If there is ambiguity, we raise a simple warning.
14443                      *      This could happen, for example, if the user
14444                      *      intended it to introduce a quantifier, but slightly
14445                      *      misspelled the quantifier.  Without this warning,
14446                      *      the quantifier would silently be taken as a literal
14447                      *      string of characters instead of a meta construct */
14448                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14449                         if (      RExC_strict
14450                             || (  p > parse_start + 1
14451                                 && isALPHA_A(*(p - 1))
14452                                 && *(p - 2) == '\\')
14453                             || new_regcurly(p, RExC_end))
14454                         {
14455                             RExC_parse = p + 1;
14456                             vFAIL("Unescaped left brace in regex is "
14457                                   "illegal here");
14458                         }
14459                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14460                                          " passed through");
14461                     }
14462                     goto normal_default;
14463                 case '}':
14464                 case ']':
14465                     if (p > RExC_parse && RExC_strict) {
14466                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14467                     }
14468                     /*FALLTHROUGH*/
14469                 default:    /* A literal character */
14470                   normal_default:
14471                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14472                         STRLEN numlen;
14473                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14474                                                &numlen, UTF8_ALLOW_DEFAULT);
14475                         p += numlen;
14476                     }
14477                     else
14478                         ender = (U8) *p++;
14479                     break;
14480                 } /* End of switch on the literal */
14481
14482                 /* Here, have looked at the literal character, and <ender>
14483                  * contains its ordinal; <p> points to the character after it.
14484                  * */
14485
14486                 if (ender > 255) {
14487                     REQUIRE_UTF8(flagp);
14488                     if (   UNICODE_IS_PERL_EXTENDED(ender)
14489                         && TO_OUTPUT_WARNINGS(p))
14490                     {
14491                         ckWARN2_non_literal_string(p,
14492                                                    packWARN(WARN_PORTABLE),
14493                                                    PL_extended_cp_format,
14494                                                    ender);
14495                     }
14496                 }
14497
14498                 /* We need to check if the next non-ignored thing is a
14499                  * quantifier.  Move <p> to after anything that should be
14500                  * ignored, which, as a side effect, positions <p> for the next
14501                  * loop iteration */
14502                 skip_to_be_ignored_text(pRExC_state, &p,
14503                                         FALSE /* Don't force to /x */ );
14504
14505                 /* If the next thing is a quantifier, it applies to this
14506                  * character only, which means that this character has to be in
14507                  * its own node and can't just be appended to the string in an
14508                  * existing node, so if there are already other characters in
14509                  * the node, close the node with just them, and set up to do
14510                  * this character again next time through, when it will be the
14511                  * only thing in its new node */
14512
14513                 next_is_quantifier =    LIKELY(p < RExC_end)
14514                                      && UNLIKELY(ISMULT2(p));
14515
14516                 if (next_is_quantifier && LIKELY(len)) {
14517                     p = oldp;
14518                     goto loopdone;
14519                 }
14520
14521                 /* Ready to add 'ender' to the node */
14522
14523                 if (! FOLD) {  /* The simple case, just append the literal */
14524                   not_fold_common:
14525
14526                     /* Don't output if it would overflow */
14527                     if (UNLIKELY(len > max_string_len - ((UTF)
14528                                                       ? UVCHR_SKIP(ender)
14529                                                       : 1)))
14530                     {
14531                         overflowed = TRUE;
14532                         break;
14533                     }
14534
14535                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14536                         *(s++) = (char) ender;
14537                     }
14538                     else {
14539                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14540                         added_len = (char *) new_s - s;
14541                         s = (char *) new_s;
14542
14543                         if (ender > 255)  {
14544                             requires_utf8_target = TRUE;
14545                         }
14546                     }
14547                 }
14548                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14549
14550                     /* Here are folding under /l, and the code point is
14551                      * problematic.  If this is the first character in the
14552                      * node, change the node type to folding.   Otherwise, if
14553                      * this is the first problematic character, close up the
14554                      * existing node, so can start a new node with this one */
14555                     if (! len) {
14556                         node_type = EXACTFL;
14557                         RExC_contains_locale = 1;
14558                     }
14559                     else if (node_type == EXACT) {
14560                         p = oldp;
14561                         goto loopdone;
14562                     }
14563
14564                     /* This problematic code point means we can't simplify
14565                      * things */
14566                     maybe_exactfu = FALSE;
14567
14568                     /* Although these two characters have folds that are
14569                      * locale-problematic, they also have folds to above Latin1
14570                      * that aren't a problem.  Doing these now helps at
14571                      * runtime. */
14572                     if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
14573                                  || ender == LATIN_CAPITAL_LETTER_SHARP_S))
14574                     {
14575                         goto fold_anyway;
14576                     }
14577
14578                     /* Here, we are adding a problematic fold character.
14579                      * "Problematic" in this context means that its fold isn't
14580                      * known until runtime.  (The non-problematic code points
14581                      * are the above-Latin1 ones that fold to also all
14582                      * above-Latin1.  Their folds don't vary no matter what the
14583                      * locale is.) But here we have characters whose fold
14584                      * depends on the locale.  We just add in the unfolded
14585                      * character, and wait until runtime to fold it */
14586                     goto not_fold_common;
14587                 }
14588                 else /* regular fold; see if actually is in a fold */
14589                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14590                          || (ender > 255
14591                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14592                 {
14593                     /* Here, folding, but the character isn't in a fold.
14594                      *
14595                      * Start a new node if previous characters in the node were
14596                      * folded */
14597                     if (len && node_type != EXACT) {
14598                         p = oldp;
14599                         goto loopdone;
14600                     }
14601
14602                     /* Here, continuing a node with non-folded characters.  Add
14603                      * this one */
14604                     goto not_fold_common;
14605                 }
14606                 else {  /* Here, does participate in some fold */
14607
14608                     /* If this is the first character in the node, change its
14609                      * type to folding.  Otherwise, if this is the first
14610                      * folding character in the node, close up the existing
14611                      * node, so can start a new node with this one.  */
14612                     if (! len) {
14613                         node_type = compute_EXACTish(pRExC_state);
14614                     }
14615                     else if (node_type == EXACT) {
14616                         p = oldp;
14617                         goto loopdone;
14618                     }
14619
14620                     if (UTF) {  /* Alway use the folded value for UTF-8
14621                                    patterns */
14622                         if (UVCHR_IS_INVARIANT(ender)) {
14623                             if (UNLIKELY(len + 1 > max_string_len)) {
14624                                 overflowed = TRUE;
14625                                 break;
14626                             }
14627
14628                             *(s)++ = (U8) toFOLD(ender);
14629                         }
14630                         else {
14631                             UV folded;
14632
14633                           fold_anyway:
14634                             folded = _to_uni_fold_flags(
14635                                     ender,
14636                                     (U8 *) s,  /* We have allocated extra space
14637                                                   in 's' so can't run off the
14638                                                   end */
14639                                     &added_len,
14640                                     FOLD_FLAGS_FULL
14641                                   | ((   ASCII_FOLD_RESTRICTED
14642                                       || node_type == EXACTFL)
14643                                     ? FOLD_FLAGS_NOMIX_ASCII
14644                                     : 0));
14645                             if (UNLIKELY(len + added_len > max_string_len)) {
14646                                 overflowed = TRUE;
14647                                 break;
14648                             }
14649
14650                             s += added_len;
14651
14652                             if (   folded > 255
14653                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14654                             {
14655                                 /* U+B5 folds to the MU, so its possible for a
14656                                  * non-UTF-8 target to match it */
14657                                 requires_utf8_target = TRUE;
14658                             }
14659                         }
14660                     }
14661                     else { /* Here is non-UTF8. */
14662
14663                         /* The fold will be one or (rarely) two characters.
14664                          * Check that there's room for at least a single one
14665                          * before setting any flags, etc.  Because otherwise an
14666                          * overflowing character could cause a flag to be set
14667                          * even though it doesn't end up in this node.  (For
14668                          * the two character fold, we check again, before
14669                          * setting any flags) */
14670                         if (UNLIKELY(len + 1 > max_string_len)) {
14671                             overflowed = TRUE;
14672                             break;
14673                         }
14674
14675 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14676    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14677                                       || UNICODE_DOT_DOT_VERSION > 0)
14678
14679                         /* On non-ancient Unicodes, check for the only possible
14680                          * multi-char fold  */
14681                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14682
14683                             /* This potential multi-char fold means the node
14684                              * can't be simple (because it could match more
14685                              * than a single char).  And in some cases it will
14686                              * match 'ss', so set that flag */
14687                             maybe_SIMPLE = 0;
14688                             has_ss = TRUE;
14689
14690                             /* It can't change to be an EXACTFU (unless already
14691                              * is one).  We fold it iff under /u rules. */
14692                             if (node_type != EXACTFU) {
14693                                 maybe_exactfu = FALSE;
14694                             }
14695                             else {
14696                                 if (UNLIKELY(len + 2 > max_string_len)) {
14697                                     overflowed = TRUE;
14698                                     break;
14699                                 }
14700
14701                                 *(s++) = 's';
14702                                 *(s++) = 's';
14703                                 added_len = 2;
14704
14705                                 goto done_with_this_char;
14706                             }
14707                         }
14708                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14709                                  && LIKELY(len > 0)
14710                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14711                         {
14712                             /* Also, the sequence 'ss' is special when not
14713                              * under /u.  If the target string is UTF-8, it
14714                              * should match SHARP S; otherwise it won't.  So,
14715                              * here we have to exclude the possibility of this
14716                              * node moving to /u.*/
14717                             has_ss = TRUE;
14718                             maybe_exactfu = FALSE;
14719                         }
14720 #endif
14721                         /* Here, the fold will be a single character */
14722
14723                         if (UNLIKELY(ender == MICRO_SIGN)) {
14724                             has_micro_sign = TRUE;
14725                         }
14726                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14727
14728                             /* If the character's fold differs between /d and
14729                              * /u, this can't change to be an EXACTFU node */
14730                             maybe_exactfu = FALSE;
14731                         }
14732
14733                         *(s++) = (DEPENDS_SEMANTICS)
14734                                  ? (char) toFOLD(ender)
14735
14736                                    /* Under /u, the fold of any character in
14737                                     * the 0-255 range happens to be its
14738                                     * lowercase equivalent, except for LATIN
14739                                     * SMALL LETTER SHARP S, which was handled
14740                                     * above, and the MICRO SIGN, whose fold
14741                                     * requires UTF-8 to represent.  */
14742                                  : (char) toLOWER_L1(ender);
14743                     }
14744                 } /* End of adding current character to the node */
14745
14746               done_with_this_char:
14747
14748                 len += added_len;
14749
14750                 if (next_is_quantifier) {
14751
14752                     /* Here, the next input is a quantifier, and to get here,
14753                      * the current character is the only one in the node. */
14754                     goto loopdone;
14755                 }
14756
14757             } /* End of loop through literal characters */
14758
14759             /* Here we have either exhausted the input or run out of room in
14760              * the node.  If the former, we are done.  (If we encountered a
14761              * character that can't be in the node, transfer is made directly
14762              * to <loopdone>, and so we wouldn't have fallen off the end of the
14763              * loop.)  */
14764             if (LIKELY(! overflowed)) {
14765                 goto loopdone;
14766             }
14767
14768             /* Here we have run out of room.  We can grow plain EXACT and
14769              * LEXACT nodes.  If the pattern is gigantic enough, though,
14770              * eventually we'll have to artificially chunk the pattern into
14771              * multiple nodes. */
14772             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14773                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14774                 Size_t overhead_expansion = 0;
14775                 char temp[256];
14776                 Size_t max_nodes_for_string;
14777                 Size_t achievable;
14778                 SSize_t delta;
14779
14780                 /* Here we couldn't fit the final character in the current
14781                  * node, so it will have to be reparsed, no matter what else we
14782                  * do */
14783                 p = oldp;
14784
14785                 /* If would have overflowed a regular EXACT node, switch
14786                  * instead to an LEXACT.  The code below is structured so that
14787                  * the actual growing code is common to changing from an EXACT
14788                  * or just increasing the LEXACT size.  This means that we have
14789                  * to save the string in the EXACT case before growing, and
14790                  * then copy it afterwards to its new location */
14791                 if (node_type == EXACT) {
14792                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14793                     RExC_emit += overhead_expansion;
14794                     Copy(s0, temp, len, char);
14795                 }
14796
14797                 /* Ready to grow.  If it was a plain EXACT, the string was
14798                  * saved, and the first few bytes of it overwritten by adding
14799                  * an argument field.  We assume, as we do elsewhere in this
14800                  * file, that one byte of remaining input will translate into
14801                  * one byte of output, and if that's too small, we grow again,
14802                  * if too large the excess memory is freed at the end */
14803
14804                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14805                 achievable = MIN(max_nodes_for_string,
14806                                  current_string_nodes + STR_SZ(RExC_end - p));
14807                 delta = achievable - current_string_nodes;
14808
14809                 /* If there is just no more room, go finish up this chunk of
14810                  * the pattern. */
14811                 if (delta <= 0) {
14812                     goto loopdone;
14813                 }
14814
14815                 change_engine_size(pRExC_state, delta + overhead_expansion);
14816                 current_string_nodes += delta;
14817                 max_string_len
14818                            = sizeof(struct regnode) * current_string_nodes;
14819                 upper_fill = max_string_len + 1;
14820
14821                 /* If the length was small, we know this was originally an
14822                  * EXACT node now converted to LEXACT, and the string has to be
14823                  * restored.  Otherwise the string was untouched.  260 is just
14824                  * a number safely above 255 so don't have to worry about
14825                  * getting it precise */
14826                 if (len < 260) {
14827                     node_type = LEXACT;
14828                     FILL_NODE(ret, node_type);
14829                     s0 = STRING(REGNODE_p(ret));
14830                     Copy(temp, s0, len, char);
14831                     s = s0 + len;
14832                 }
14833
14834                 goto continue_parse;
14835             }
14836             else if (FOLD) {
14837                 bool splittable = FALSE;
14838                 bool backed_up = FALSE;
14839                 char * e;       /* should this be U8? */
14840                 char * s_start; /* should this be U8? */
14841
14842                 /* Here is /i.  Running out of room creates a problem if we are
14843                  * folding, and the split happens in the middle of a
14844                  * multi-character fold, as a match that should have occurred,
14845                  * won't, due to the way nodes are matched, and our artificial
14846                  * boundary.  So back off until we aren't splitting such a
14847                  * fold.  If there is no such place to back off to, we end up
14848                  * taking the entire node as-is.  This can happen if the node
14849                  * consists entirely of 'f' or entirely of 's' characters (or
14850                  * things that fold to them) as 'ff' and 'ss' are
14851                  * multi-character folds.
14852                  *
14853                  * The Unicode standard says that multi character folds consist
14854                  * of either two or three characters.  That means we would be
14855                  * splitting one if the final character in the node is at the
14856                  * beginning of either type, or is the second of a three
14857                  * character fold.
14858                  *
14859                  * At this point:
14860                  *  ender     is the code point of the character that won't fit
14861                  *            in the node
14862                  *  s         points to just beyond the final byte in the node.
14863                  *            It's where we would place ender if there were
14864                  *            room, and where in fact we do place ender's fold
14865                  *            in the code below, as we've over-allocated space
14866                  *            for s0 (hence s) to allow for this
14867                  *  e         starts at 's' and advances as we append things.
14868                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14869                  *            have been advanced to beyond it).
14870                  *  old_old_s points to the beginning byte of the final
14871                  *            character in the node
14872                  *  p         points to the beginning byte in the input of the
14873                  *            character beyond 'ender'.
14874                  *  oldp      points to the beginning byte in the input of
14875                  *            'ender'.
14876                  *
14877                  * In the case of /il, we haven't folded anything that could be
14878                  * affected by the locale.  That means only above-Latin1
14879                  * characters that fold to other above-latin1 characters get
14880                  * folded at compile time.  To check where a good place to
14881                  * split nodes is, everything in it will have to be folded.
14882                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14883                  * any unfolded characters in the node. */
14884                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14885
14886                 /* If we do need to fold the node, we need a place to store the
14887                  * folded copy, and a way to map back to the unfolded original
14888                  * */
14889                 char * locfold_buf = NULL;
14890                 Size_t * loc_correspondence = NULL;
14891
14892                 if (! need_to_fold_loc) {   /* The normal case.  Just
14893                                                initialize to the actual node */
14894                     e = s;
14895                     s_start = s0;
14896                     s = old_old_s;  /* Point to the beginning of the final char
14897                                        that fits in the node */
14898                 }
14899                 else {
14900
14901                     /* Here, we have filled a /il node, and there are unfolded
14902                      * characters in it.  If the runtime locale turns out to be
14903                      * UTF-8, there are possible multi-character folds, just
14904                      * like when not under /l.  The node hence can't terminate
14905                      * in the middle of such a fold.  To determine this, we
14906                      * have to create a folded copy of this node.  That means
14907                      * reparsing the node, folding everything assuming a UTF-8
14908                      * locale.  (If at runtime it isn't such a locale, the
14909                      * actions here wouldn't have been necessary, but we have
14910                      * to assume the worst case.)  If we find we need to back
14911                      * off the folded string, we do so, and then map that
14912                      * position back to the original unfolded node, which then
14913                      * gets output, truncated at that spot */
14914
14915                     char * redo_p = RExC_parse;
14916                     char * redo_e;
14917                     char * old_redo_e;
14918
14919                     /* Allow enough space assuming a single byte input folds to
14920                      * a single byte output, plus assume that the two unparsed
14921                      * characters (that we may need) fold to the largest number
14922                      * of bytes possible, plus extra for one more worst case
14923                      * scenario.  In the loop below, if we start eating into
14924                      * that final spare space, we enlarge this initial space */
14925                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14926
14927                     Newxz(locfold_buf, size, char);
14928                     Newxz(loc_correspondence, size, Size_t);
14929
14930                     /* Redo this node's parse, folding into 'locfold_buf' */
14931                     redo_p = RExC_parse;
14932                     old_redo_e = redo_e = locfold_buf;
14933                     while (redo_p <= oldp) {
14934
14935                         old_redo_e = redo_e;
14936                         loc_correspondence[redo_e - locfold_buf]
14937                                                         = redo_p - RExC_parse;
14938
14939                         if (UTF) {
14940                             Size_t added_len;
14941
14942                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14943                                                        (U8 *) RExC_end,
14944                                                        (U8 *) redo_e,
14945                                                        &added_len,
14946                                                        FOLD_FLAGS_FULL);
14947                             redo_e += added_len;
14948                             redo_p += UTF8SKIP(redo_p);
14949                         }
14950                         else {
14951
14952                             /* Note that if this code is run on some ancient
14953                              * Unicode versions, SHARP S doesn't fold to 'ss',
14954                              * but rather than clutter the code with #ifdef's,
14955                              * as is done above, we ignore that possibility.
14956                              * This is ok because this code doesn't affect what
14957                              * gets matched, but merely where the node gets
14958                              * split */
14959                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14960                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14961                             }
14962                             else {
14963                                 *redo_e++ = 's';
14964                                 *redo_e++ = 's';
14965                             }
14966                             redo_p++;
14967                         }
14968
14969
14970                         /* If we're getting so close to the end that a
14971                          * worst-case fold in the next character would cause us
14972                          * to overflow, increase, assuming one byte output byte
14973                          * per one byte input one, plus room for another worst
14974                          * case fold */
14975                         if (   redo_p <= oldp
14976                             && redo_e > locfold_buf + size
14977                                                     - (UTF8_MAXBYTES_CASE + 1))
14978                         {
14979                             Size_t new_size = size
14980                                             + (oldp - redo_p)
14981                                             + UTF8_MAXBYTES_CASE + 1;
14982                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14983
14984                             Renew(locfold_buf, new_size, char);
14985                             Renew(loc_correspondence, new_size, Size_t);
14986                             size = new_size;
14987
14988                             redo_e = locfold_buf + e_offset;
14989                         }
14990                     }
14991
14992                     /* Set so that things are in terms of the folded, temporary
14993                      * string */
14994                     s = old_redo_e;
14995                     s_start = locfold_buf;
14996                     e = redo_e;
14997
14998                 }
14999
15000                 /* Here, we have 's', 's_start' and 'e' set up to point to the
15001                  * input that goes into the node, folded.
15002                  *
15003                  * If the final character of the node and the fold of ender
15004                  * form the first two characters of a three character fold, we
15005                  * need to peek ahead at the next (unparsed) character in the
15006                  * input to determine if the three actually do form such a
15007                  * fold.  Just looking at that character is not generally
15008                  * sufficient, as it could be, for example, an escape sequence
15009                  * that evaluates to something else, and it needs to be folded.
15010                  *
15011                  * khw originally thought to just go through the parse loop one
15012                  * extra time, but that doesn't work easily as that iteration
15013                  * could cause things to think that the parse is over and to
15014                  * goto loopdone.  The character could be a '$' for example, or
15015                  * the character beyond could be a quantifier, and other
15016                  * glitches as well.
15017                  *
15018                  * The solution used here for peeking ahead is to look at that
15019                  * next character.  If it isn't ASCII punctuation, then it will
15020                  * be something that would continue on in an EXACTish node if
15021                  * there were space.  We append the fold of it to s, having
15022                  * reserved enough room in s0 for the purpose.  If we can't
15023                  * reasonably peek ahead, we instead assume the worst case:
15024                  * that it is something that would form the completion of a
15025                  * multi-char fold.
15026                  *
15027                  * If we can't split between s and ender, we work backwards
15028                  * character-by-character down to s0.  At each current point
15029                  * see if we are at the beginning of a multi-char fold.  If so,
15030                  * that means we would be splitting the fold across nodes, and
15031                  * so we back up one and try again.
15032                  *
15033                  * If we're not at the beginning, we still could be at the
15034                  * final two characters of a (rare) three character fold.  We
15035                  * check if the sequence starting at the character before the
15036                  * current position (and including the current and next
15037                  * characters) is a three character fold.  If not, the node can
15038                  * be split here.  If it is, we have to backup two characters
15039                  * and try again.
15040                  *
15041                  * Otherwise, the node can be split at the current position.
15042                  *
15043                  * The same logic is used for UTF-8 patterns and not */
15044                 if (UTF) {
15045                     Size_t added_len;
15046
15047                     /* Append the fold of ender */
15048                     (void) _to_uni_fold_flags(
15049                         ender,
15050                         (U8 *) e,
15051                         &added_len,
15052                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15053                                         ? FOLD_FLAGS_NOMIX_ASCII
15054                                         : 0));
15055                     e += added_len;
15056
15057                     /* 's' and the character folded to by ender may be the
15058                      * first two of a three-character fold, in which case the
15059                      * node should not be split here.  That may mean examining
15060                      * the so-far unparsed character starting at 'p'.  But if
15061                      * ender folded to more than one character, we already have
15062                      * three characters to look at.  Also, we first check if
15063                      * the sequence consisting of s and the next character form
15064                      * the first two of some three character fold.  If not,
15065                      * there's no need to peek ahead. */
15066                     if (   added_len <= UTF8SKIP(e - added_len)
15067                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15068                     {
15069                         /* Here, the two do form the beginning of a potential
15070                          * three character fold.  The unexamined character may
15071                          * or may not complete it.  Peek at it.  It might be
15072                          * something that ends the node or an escape sequence,
15073                          * in which case we don't know without a lot of work
15074                          * what it evaluates to, so we have to assume the worst
15075                          * case: that it does complete the fold, and so we
15076                          * can't split here.  All such instances  will have
15077                          * that character be an ASCII punctuation character,
15078                          * like a backslash.  So, for that case, backup one and
15079                          * drop down to try at that position */
15080                         if (isPUNCT(*p)) {
15081                             s = (char *) utf8_hop_back((U8 *) s, -1,
15082                                        (U8 *) s_start);
15083                             backed_up = TRUE;
15084                         }
15085                         else {
15086                             /* Here, since it's not punctuation, it must be a
15087                              * real character, and we can append its fold to
15088                              * 'e' (having deliberately reserved enough space
15089                              * for this eventuality) and drop down to check if
15090                              * the three actually do form a folded sequence */
15091                             (void) _to_utf8_fold_flags(
15092                                 (U8 *) p, (U8 *) RExC_end,
15093                                 (U8 *) e,
15094                                 &added_len,
15095                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15096                                                 ? FOLD_FLAGS_NOMIX_ASCII
15097                                                 : 0));
15098                             e += added_len;
15099                         }
15100                     }
15101
15102                     /* Here, we either have three characters available in
15103                      * sequence starting at 's', or we have two characters and
15104                      * know that the following one can't possibly be part of a
15105                      * three character fold.  We go through the node backwards
15106                      * until we find a place where we can split it without
15107                      * breaking apart a multi-character fold.  At any given
15108                      * point we have to worry about if such a fold begins at
15109                      * the current 's', and also if a three-character fold
15110                      * begins at s-1, (containing s and s+1).  Splitting in
15111                      * either case would break apart a fold */
15112                     do {
15113                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15114                                                             (U8 *) s_start);
15115
15116                         /* If is a multi-char fold, can't split here.  Backup
15117                          * one char and try again */
15118                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15119                             s = prev_s;
15120                             backed_up = TRUE;
15121                             continue;
15122                         }
15123
15124                         /* If the two characters beginning at 's' are part of a
15125                          * three character fold starting at the character
15126                          * before s, we can't split either before or after s.
15127                          * Backup two chars and try again */
15128                         if (   LIKELY(s > s_start)
15129                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15130                         {
15131                             s = prev_s;
15132                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15133                             backed_up = TRUE;
15134                             continue;
15135                         }
15136
15137                         /* Here there's no multi-char fold between s and the
15138                          * next character following it.  We can split */
15139                         splittable = TRUE;
15140                         break;
15141
15142                     } while (s > s_start); /* End of loops backing up through the node */
15143
15144                     /* Here we either couldn't find a place to split the node,
15145                      * or else we broke out of the loop setting 'splittable' to
15146                      * true.  In the latter case, the place to split is between
15147                      * the first and second characters in the sequence starting
15148                      * at 's' */
15149                     if (splittable) {
15150                         s += UTF8SKIP(s);
15151                     }
15152                 }
15153                 else {  /* Pattern not UTF-8 */
15154                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
15155                         || ASCII_FOLD_RESTRICTED)
15156                     {
15157                         assert( toLOWER_L1(ender) < 256 );
15158                         *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15159                     }
15160                     else {
15161                         *e++ = 's';
15162                         *e++ = 's';
15163                     }
15164
15165                     if (   e - s  <= 1
15166                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15167                     {
15168                         if (isPUNCT(*p)) {
15169                             s--;
15170                             backed_up = TRUE;
15171                         }
15172                         else {
15173                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15174                                 || ASCII_FOLD_RESTRICTED)
15175                             {
15176                                 assert( toLOWER_L1(ender) < 256 );
15177                                 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15178                             }
15179                             else {
15180                                 *e++ = 's';
15181                                 *e++ = 's';
15182                             }
15183                         }
15184                     }
15185
15186                     do {
15187                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15188                             s--;
15189                             backed_up = TRUE;
15190                             continue;
15191                         }
15192
15193                         if (   LIKELY(s > s_start)
15194                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15195                         {
15196                             s -= 2;
15197                             backed_up = TRUE;
15198                             continue;
15199                         }
15200
15201                         splittable = TRUE;
15202                         break;
15203
15204                     } while (s > s_start);
15205
15206                     if (splittable) {
15207                         s++;
15208                     }
15209                 }
15210
15211                 /* Here, we are done backing up.  If we didn't backup at all
15212                  * (the likely case), just proceed */
15213                 if (backed_up) {
15214
15215                    /* If we did find a place to split, reparse the entire node
15216                     * stopping where we have calculated. */
15217                     if (splittable) {
15218
15219                        /* If we created a temporary folded string under /l, we
15220                         * have to map that back to the original */
15221                         if (need_to_fold_loc) {
15222                             upper_fill = loc_correspondence[s - s_start];
15223                             if (upper_fill == 0) {
15224                                 FAIL2("panic: loc_correspondence[%d] is 0",
15225                                       (int) (s - s_start));
15226                             }
15227                             Safefree(locfold_buf);
15228                             Safefree(loc_correspondence);
15229                         }
15230                         else {
15231                             upper_fill = s - s0;
15232                         }
15233                         goto reparse;
15234                     }
15235
15236                     /* Here the node consists entirely of non-final multi-char
15237                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
15238                      * decent place to split it, so give up and just take the
15239                      * whole thing */
15240                     len = old_s - s0;
15241                 }
15242
15243                 if (need_to_fold_loc) {
15244                     Safefree(locfold_buf);
15245                     Safefree(loc_correspondence);
15246                 }
15247             }   /* End of verifying node ends with an appropriate char */
15248
15249             /* We need to start the next node at the character that didn't fit
15250              * in this one */
15251             p = oldp;
15252
15253           loopdone:   /* Jumped to when encounters something that shouldn't be
15254                          in the node */
15255
15256             /* Free up any over-allocated space; cast is to silence bogus
15257              * warning in MS VC */
15258             change_engine_size(pRExC_state,
15259                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15260
15261             /* I (khw) don't know if you can get here with zero length, but the
15262              * old code handled this situation by creating a zero-length EXACT
15263              * node.  Might as well be NOTHING instead */
15264             if (len == 0) {
15265                 OP(REGNODE_p(ret)) = NOTHING;
15266             }
15267             else {
15268
15269                 /* If the node type is EXACT here, check to see if it
15270                  * should be EXACTL, or EXACT_REQ8. */
15271                 if (node_type == EXACT) {
15272                     if (LOC) {
15273                         node_type = EXACTL;
15274                     }
15275                     else if (requires_utf8_target) {
15276                         node_type = EXACT_REQ8;
15277                     }
15278                 }
15279                 else if (node_type == LEXACT) {
15280                     if (requires_utf8_target) {
15281                         node_type = LEXACT_REQ8;
15282                     }
15283                 }
15284                 else if (FOLD) {
15285                     if (    UNLIKELY(has_micro_sign || has_ss)
15286                         && (node_type == EXACTFU || (   node_type == EXACTF
15287                                                      && maybe_exactfu)))
15288                     {   /* These two conditions are problematic in non-UTF-8
15289                            EXACTFU nodes. */
15290                         assert(! UTF);
15291                         node_type = EXACTFUP;
15292                     }
15293                     else if (node_type == EXACTFL) {
15294
15295                         /* 'maybe_exactfu' is deliberately set above to
15296                          * indicate this node type, where all code points in it
15297                          * are above 255 */
15298                         if (maybe_exactfu) {
15299                             node_type = EXACTFLU8;
15300                         }
15301                         else if (UNLIKELY(
15302                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15303                         {
15304                             /* A character that folds to more than one will
15305                              * match multiple characters, so can't be SIMPLE.
15306                              * We don't have to worry about this with EXACTFLU8
15307                              * nodes just above, as they have already been
15308                              * folded (since the fold doesn't vary at run
15309                              * time).  Here, if the final character in the node
15310                              * folds to multiple, it can't be simple.  (This
15311                              * only has an effect if the node has only a single
15312                              * character, hence the final one, as elsewhere we
15313                              * turn off simple for nodes whose length > 1 */
15314                             maybe_SIMPLE = 0;
15315                         }
15316                     }
15317                     else if (node_type == EXACTF) {  /* Means is /di */
15318
15319                         /* This intermediate variable is needed solely because
15320                          * the asserts in the macro where used exceed Win32's
15321                          * literal string capacity */
15322                         char first_char = * STRING(REGNODE_p(ret));
15323
15324                         /* If 'maybe_exactfu' is clear, then we need to stay
15325                          * /di.  If it is set, it means there are no code
15326                          * points that match differently depending on UTF8ness
15327                          * of the target string, so it can become an EXACTFU
15328                          * node */
15329                         if (! maybe_exactfu) {
15330                             RExC_seen_d_op = TRUE;
15331                         }
15332                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15333                                  || isALPHA_FOLD_EQ(ender, 's'))
15334                         {
15335                             /* But, if the node begins or ends in an 's' we
15336                              * have to defer changing it into an EXACTFU, as
15337                              * the node could later get joined with another one
15338                              * that ends or begins with 's' creating an 'ss'
15339                              * sequence which would then wrongly match the
15340                              * sharp s without the target being UTF-8.  We
15341                              * create a special node that we resolve later when
15342                              * we join nodes together */
15343
15344                             node_type = EXACTFU_S_EDGE;
15345                         }
15346                         else {
15347                             node_type = EXACTFU;
15348                         }
15349                     }
15350
15351                     if (requires_utf8_target && node_type == EXACTFU) {
15352                         node_type = EXACTFU_REQ8;
15353                     }
15354                 }
15355
15356                 OP(REGNODE_p(ret)) = node_type;
15357                 setSTR_LEN(REGNODE_p(ret), len);
15358                 RExC_emit += STR_SZ(len);
15359
15360                 /* If the node isn't a single character, it can't be SIMPLE */
15361                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15362                     maybe_SIMPLE = 0;
15363                 }
15364
15365                 *flagp |= HASWIDTH | maybe_SIMPLE;
15366             }
15367
15368             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15369             RExC_parse = p;
15370
15371             {
15372                 /* len is STRLEN which is unsigned, need to copy to signed */
15373                 IV iv = len;
15374                 if (iv < 0)
15375                     vFAIL("Internal disaster");
15376             }
15377
15378         } /* End of label 'defchar:' */
15379         break;
15380     } /* End of giant switch on input character */
15381
15382     /* Position parse to next real character */
15383     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15384                                             FALSE /* Don't force to /x */ );
15385     if (   *RExC_parse == '{'
15386         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15387     {
15388         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15389             RExC_parse++;
15390             vFAIL("Unescaped left brace in regex is illegal here");
15391         }
15392         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15393                                   " passed through");
15394     }
15395
15396     return(ret);
15397 }
15398
15399
15400 STATIC void
15401 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15402 {
15403     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15404      * sets up the bitmap and any flags, removing those code points from the
15405      * inversion list, setting it to NULL should it become completely empty */
15406
15407
15408     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15409     assert(PL_regkind[OP(node)] == ANYOF);
15410
15411     /* There is no bitmap for this node type */
15412     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15413         return;
15414     }
15415
15416     ANYOF_BITMAP_ZERO(node);
15417     if (*invlist_ptr) {
15418
15419         /* This gets set if we actually need to modify things */
15420         bool change_invlist = FALSE;
15421
15422         UV start, end;
15423
15424         /* Start looking through *invlist_ptr */
15425         invlist_iterinit(*invlist_ptr);
15426         while (invlist_iternext(*invlist_ptr, &start, &end)) {
15427             UV high;
15428             int i;
15429
15430             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15431                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15432             }
15433
15434             /* Quit if are above what we should change */
15435             if (start >= NUM_ANYOF_CODE_POINTS) {
15436                 break;
15437             }
15438
15439             change_invlist = TRUE;
15440
15441             /* Set all the bits in the range, up to the max that we are doing */
15442             high = (end < NUM_ANYOF_CODE_POINTS - 1)
15443                    ? end
15444                    : NUM_ANYOF_CODE_POINTS - 1;
15445             for (i = start; i <= (int) high; i++) {
15446                 ANYOF_BITMAP_SET(node, i);
15447             }
15448         }
15449         invlist_iterfinish(*invlist_ptr);
15450
15451         /* Done with loop; remove any code points that are in the bitmap from
15452          * *invlist_ptr; similarly for code points above the bitmap if we have
15453          * a flag to match all of them anyways */
15454         if (change_invlist) {
15455             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15456         }
15457         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15458             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15459         }
15460
15461         /* If have completely emptied it, remove it completely */
15462         if (_invlist_len(*invlist_ptr) == 0) {
15463             SvREFCNT_dec_NN(*invlist_ptr);
15464             *invlist_ptr = NULL;
15465         }
15466     }
15467 }
15468
15469 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15470    Character classes ([:foo:]) can also be negated ([:^foo:]).
15471    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15472    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15473    but trigger failures because they are currently unimplemented. */
15474
15475 #define POSIXCC_DONE(c)   ((c) == ':')
15476 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15477 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15478 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15479
15480 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15481 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15482 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15483
15484 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15485
15486 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15487  * routine. q.v. */
15488 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15489         if (posix_warnings) {                                               \
15490             if (! RExC_warn_text ) RExC_warn_text =                         \
15491                                          (AV *) sv_2mortal((SV *) newAV()); \
15492             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15493                                              WARNING_PREFIX                 \
15494                                              text                           \
15495                                              REPORT_LOCATION,               \
15496                                              REPORT_LOCATION_ARGS(p)));     \
15497         }                                                                   \
15498     } STMT_END
15499 #define CLEAR_POSIX_WARNINGS()                                              \
15500     STMT_START {                                                            \
15501         if (posix_warnings && RExC_warn_text)                               \
15502             av_clear(RExC_warn_text);                                       \
15503     } STMT_END
15504
15505 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15506     STMT_START {                                                            \
15507         CLEAR_POSIX_WARNINGS();                                             \
15508         return ret;                                                         \
15509     } STMT_END
15510
15511 STATIC int
15512 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15513
15514     const char * const s,      /* Where the putative posix class begins.
15515                                   Normally, this is one past the '['.  This
15516                                   parameter exists so it can be somewhere
15517                                   besides RExC_parse. */
15518     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15519                                   NULL */
15520     AV ** posix_warnings,      /* Where to place any generated warnings, or
15521                                   NULL */
15522     const bool check_only      /* Don't die if error */
15523 )
15524 {
15525     /* This parses what the caller thinks may be one of the three POSIX
15526      * constructs:
15527      *  1) a character class, like [:blank:]
15528      *  2) a collating symbol, like [. .]
15529      *  3) an equivalence class, like [= =]
15530      * In the latter two cases, it croaks if it finds a syntactically legal
15531      * one, as these are not handled by Perl.
15532      *
15533      * The main purpose is to look for a POSIX character class.  It returns:
15534      *  a) the class number
15535      *      if it is a completely syntactically and semantically legal class.
15536      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15537      *      closing ']' of the class
15538      *  b) OOB_NAMEDCLASS
15539      *      if it appears that one of the three POSIX constructs was meant, but
15540      *      its specification was somehow defective.  'updated_parse_ptr', if
15541      *      not NULL, is set to point to the character just after the end
15542      *      character of the class.  See below for handling of warnings.
15543      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15544      *      if it  doesn't appear that a POSIX construct was intended.
15545      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15546      *      raised.
15547      *
15548      * In b) there may be errors or warnings generated.  If 'check_only' is
15549      * TRUE, then any errors are discarded.  Warnings are returned to the
15550      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15551      * instead it is NULL, warnings are suppressed.
15552      *
15553      * The reason for this function, and its complexity is that a bracketed
15554      * character class can contain just about anything.  But it's easy to
15555      * mistype the very specific posix class syntax but yielding a valid
15556      * regular bracketed class, so it silently gets compiled into something
15557      * quite unintended.
15558      *
15559      * The solution adopted here maintains backward compatibility except that
15560      * it adds a warning if it looks like a posix class was intended but
15561      * improperly specified.  The warning is not raised unless what is input
15562      * very closely resembles one of the 14 legal posix classes.  To do this,
15563      * it uses fuzzy parsing.  It calculates how many single-character edits it
15564      * would take to transform what was input into a legal posix class.  Only
15565      * if that number is quite small does it think that the intention was a
15566      * posix class.  Obviously these are heuristics, and there will be cases
15567      * where it errs on one side or another, and they can be tweaked as
15568      * experience informs.
15569      *
15570      * The syntax for a legal posix class is:
15571      *
15572      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15573      *
15574      * What this routine considers syntactically to be an intended posix class
15575      * is this (the comments indicate some restrictions that the pattern
15576      * doesn't show):
15577      *
15578      *  qr/(?x: \[?                         # The left bracket, possibly
15579      *                                      # omitted
15580      *          \h*                         # possibly followed by blanks
15581      *          (?: \^ \h* )?               # possibly a misplaced caret
15582      *          [:;]?                       # The opening class character,
15583      *                                      # possibly omitted.  A typo
15584      *                                      # semi-colon can also be used.
15585      *          \h*
15586      *          \^?                         # possibly a correctly placed
15587      *                                      # caret, but not if there was also
15588      *                                      # a misplaced one
15589      *          \h*
15590      *          .{3,15}                     # The class name.  If there are
15591      *                                      # deviations from the legal syntax,
15592      *                                      # its edit distance must be close
15593      *                                      # to a real class name in order
15594      *                                      # for it to be considered to be
15595      *                                      # an intended posix class.
15596      *          \h*
15597      *          [[:punct:]]?                # The closing class character,
15598      *                                      # possibly omitted.  If not a colon
15599      *                                      # nor semi colon, the class name
15600      *                                      # must be even closer to a valid
15601      *                                      # one
15602      *          \h*
15603      *          \]?                         # The right bracket, possibly
15604      *                                      # omitted.
15605      *     )/
15606      *
15607      * In the above, \h must be ASCII-only.
15608      *
15609      * These are heuristics, and can be tweaked as field experience dictates.
15610      * There will be cases when someone didn't intend to specify a posix class
15611      * that this warns as being so.  The goal is to minimize these, while
15612      * maximizing the catching of things intended to be a posix class that
15613      * aren't parsed as such.
15614      */
15615
15616     const char* p             = s;
15617     const char * const e      = RExC_end;
15618     unsigned complement       = 0;      /* If to complement the class */
15619     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15620     bool has_opening_bracket  = FALSE;
15621     bool has_opening_colon    = FALSE;
15622     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15623                                                    valid class */
15624     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15625     const char* name_start;             /* ptr to class name first char */
15626
15627     /* If the number of single-character typos the input name is away from a
15628      * legal name is no more than this number, it is considered to have meant
15629      * the legal name */
15630     int max_distance          = 2;
15631
15632     /* to store the name.  The size determines the maximum length before we
15633      * decide that no posix class was intended.  Should be at least
15634      * sizeof("alphanumeric") */
15635     UV input_text[15];
15636     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15637
15638     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15639
15640     CLEAR_POSIX_WARNINGS();
15641
15642     if (p >= e) {
15643         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15644     }
15645
15646     if (*(p - 1) != '[') {
15647         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15648         found_problem = TRUE;
15649     }
15650     else {
15651         has_opening_bracket = TRUE;
15652     }
15653
15654     /* They could be confused and think you can put spaces between the
15655      * components */
15656     if (isBLANK(*p)) {
15657         found_problem = TRUE;
15658
15659         do {
15660             p++;
15661         } while (p < e && isBLANK(*p));
15662
15663         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15664     }
15665
15666     /* For [. .] and [= =].  These are quite different internally from [: :],
15667      * so they are handled separately.  */
15668     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15669                                             and 1 for at least one char in it
15670                                           */
15671     {
15672         const char open_char  = *p;
15673         const char * temp_ptr = p + 1;
15674
15675         /* These two constructs are not handled by perl, and if we find a
15676          * syntactically valid one, we croak.  khw, who wrote this code, finds
15677          * this explanation of them very unclear:
15678          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15679          * And searching the rest of the internet wasn't very helpful either.
15680          * It looks like just about any byte can be in these constructs,
15681          * depending on the locale.  But unless the pattern is being compiled
15682          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15683          * In that case, it looks like [= =] isn't allowed at all, and that
15684          * [. .] could be any single code point, but for longer strings the
15685          * constituent characters would have to be the ASCII alphabetics plus
15686          * the minus-hyphen.  Any sensible locale definition would limit itself
15687          * to these.  And any portable one definitely should.  Trying to parse
15688          * the general case is a nightmare (see [perl #127604]).  So, this code
15689          * looks only for interiors of these constructs that match:
15690          *      qr/.|[-\w]{2,}/
15691          * Using \w relaxes the apparent rules a little, without adding much
15692          * danger of mistaking something else for one of these constructs.
15693          *
15694          * [. .] in some implementations described on the internet is usable to
15695          * escape a character that otherwise is special in bracketed character
15696          * classes.  For example [.].] means a literal right bracket instead of
15697          * the ending of the class
15698          *
15699          * [= =] can legitimately contain a [. .] construct, but we don't
15700          * handle this case, as that [. .] construct will later get parsed
15701          * itself and croak then.  And [= =] is checked for even when not under
15702          * /l, as Perl has long done so.
15703          *
15704          * The code below relies on there being a trailing NUL, so it doesn't
15705          * have to keep checking if the parse ptr < e.
15706          */
15707         if (temp_ptr[1] == open_char) {
15708             temp_ptr++;
15709         }
15710         else while (    temp_ptr < e
15711                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15712         {
15713             temp_ptr++;
15714         }
15715
15716         if (*temp_ptr == open_char) {
15717             temp_ptr++;
15718             if (*temp_ptr == ']') {
15719                 temp_ptr++;
15720                 if (! found_problem && ! check_only) {
15721                     RExC_parse = (char *) temp_ptr;
15722                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15723                             "extensions", open_char, open_char);
15724                 }
15725
15726                 /* Here, the syntax wasn't completely valid, or else the call
15727                  * is to check-only */
15728                 if (updated_parse_ptr) {
15729                     *updated_parse_ptr = (char *) temp_ptr;
15730                 }
15731
15732                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15733             }
15734         }
15735
15736         /* If we find something that started out to look like one of these
15737          * constructs, but isn't, we continue below so that it can be checked
15738          * for being a class name with a typo of '.' or '=' instead of a colon.
15739          * */
15740     }
15741
15742     /* Here, we think there is a possibility that a [: :] class was meant, and
15743      * we have the first real character.  It could be they think the '^' comes
15744      * first */
15745     if (*p == '^') {
15746         found_problem = TRUE;
15747         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15748         complement = 1;
15749         p++;
15750
15751         if (isBLANK(*p)) {
15752             found_problem = TRUE;
15753
15754             do {
15755                 p++;
15756             } while (p < e && isBLANK(*p));
15757
15758             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15759         }
15760     }
15761
15762     /* But the first character should be a colon, which they could have easily
15763      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15764      * distinguish from a colon, so treat that as a colon).  */
15765     if (*p == ':') {
15766         p++;
15767         has_opening_colon = TRUE;
15768     }
15769     else if (*p == ';') {
15770         found_problem = TRUE;
15771         p++;
15772         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15773         has_opening_colon = TRUE;
15774     }
15775     else {
15776         found_problem = TRUE;
15777         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15778
15779         /* Consider an initial punctuation (not one of the recognized ones) to
15780          * be a left terminator */
15781         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15782             p++;
15783         }
15784     }
15785
15786     /* They may think that you can put spaces between the components */
15787     if (isBLANK(*p)) {
15788         found_problem = TRUE;
15789
15790         do {
15791             p++;
15792         } while (p < e && isBLANK(*p));
15793
15794         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15795     }
15796
15797     if (*p == '^') {
15798
15799         /* We consider something like [^:^alnum:]] to not have been intended to
15800          * be a posix class, but XXX maybe we should */
15801         if (complement) {
15802             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15803         }
15804
15805         complement = 1;
15806         p++;
15807     }
15808
15809     /* Again, they may think that you can put spaces between the components */
15810     if (isBLANK(*p)) {
15811         found_problem = TRUE;
15812
15813         do {
15814             p++;
15815         } while (p < e && isBLANK(*p));
15816
15817         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15818     }
15819
15820     if (*p == ']') {
15821
15822         /* XXX This ']' may be a typo, and something else was meant.  But
15823          * treating it as such creates enough complications, that that
15824          * possibility isn't currently considered here.  So we assume that the
15825          * ']' is what is intended, and if we've already found an initial '[',
15826          * this leaves this construct looking like [:] or [:^], which almost
15827          * certainly weren't intended to be posix classes */
15828         if (has_opening_bracket) {
15829             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15830         }
15831
15832         /* But this function can be called when we parse the colon for
15833          * something like qr/[alpha:]]/, so we back up to look for the
15834          * beginning */
15835         p--;
15836
15837         if (*p == ';') {
15838             found_problem = TRUE;
15839             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15840         }
15841         else if (*p != ':') {
15842
15843             /* XXX We are currently very restrictive here, so this code doesn't
15844              * consider the possibility that, say, /[alpha.]]/ was intended to
15845              * be a posix class. */
15846             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15847         }
15848
15849         /* Here we have something like 'foo:]'.  There was no initial colon,
15850          * and we back up over 'foo.  XXX Unlike the going forward case, we
15851          * don't handle typos of non-word chars in the middle */
15852         has_opening_colon = FALSE;
15853         p--;
15854
15855         while (p > RExC_start && isWORDCHAR(*p)) {
15856             p--;
15857         }
15858         p++;
15859
15860         /* Here, we have positioned ourselves to where we think the first
15861          * character in the potential class is */
15862     }
15863
15864     /* Now the interior really starts.  There are certain key characters that
15865      * can end the interior, or these could just be typos.  To catch both
15866      * cases, we may have to do two passes.  In the first pass, we keep on
15867      * going unless we come to a sequence that matches
15868      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15869      * This means it takes a sequence to end the pass, so two typos in a row if
15870      * that wasn't what was intended.  If the class is perfectly formed, just
15871      * this one pass is needed.  We also stop if there are too many characters
15872      * being accumulated, but this number is deliberately set higher than any
15873      * real class.  It is set high enough so that someone who thinks that
15874      * 'alphanumeric' is a correct name would get warned that it wasn't.
15875      * While doing the pass, we keep track of where the key characters were in
15876      * it.  If we don't find an end to the class, and one of the key characters
15877      * was found, we redo the pass, but stop when we get to that character.
15878      * Thus the key character was considered a typo in the first pass, but a
15879      * terminator in the second.  If two key characters are found, we stop at
15880      * the second one in the first pass.  Again this can miss two typos, but
15881      * catches a single one
15882      *
15883      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15884      * point to the first key character.  For the second pass, it starts as -1.
15885      * */
15886
15887     name_start = p;
15888   parse_name:
15889     {
15890         bool has_blank               = FALSE;
15891         bool has_upper               = FALSE;
15892         bool has_terminating_colon   = FALSE;
15893         bool has_terminating_bracket = FALSE;
15894         bool has_semi_colon          = FALSE;
15895         unsigned int name_len        = 0;
15896         int punct_count              = 0;
15897
15898         while (p < e) {
15899
15900             /* Squeeze out blanks when looking up the class name below */
15901             if (isBLANK(*p) ) {
15902                 has_blank = TRUE;
15903                 found_problem = TRUE;
15904                 p++;
15905                 continue;
15906             }
15907
15908             /* The name will end with a punctuation */
15909             if (isPUNCT(*p)) {
15910                 const char * peek = p + 1;
15911
15912                 /* Treat any non-']' punctuation followed by a ']' (possibly
15913                  * with intervening blanks) as trying to terminate the class.
15914                  * ']]' is very likely to mean a class was intended (but
15915                  * missing the colon), but the warning message that gets
15916                  * generated shows the error position better if we exit the
15917                  * loop at the bottom (eventually), so skip it here. */
15918                 if (*p != ']') {
15919                     if (peek < e && isBLANK(*peek)) {
15920                         has_blank = TRUE;
15921                         found_problem = TRUE;
15922                         do {
15923                             peek++;
15924                         } while (peek < e && isBLANK(*peek));
15925                     }
15926
15927                     if (peek < e && *peek == ']') {
15928                         has_terminating_bracket = TRUE;
15929                         if (*p == ':') {
15930                             has_terminating_colon = TRUE;
15931                         }
15932                         else if (*p == ';') {
15933                             has_semi_colon = TRUE;
15934                             has_terminating_colon = TRUE;
15935                         }
15936                         else {
15937                             found_problem = TRUE;
15938                         }
15939                         p = peek + 1;
15940                         goto try_posix;
15941                     }
15942                 }
15943
15944                 /* Here we have punctuation we thought didn't end the class.
15945                  * Keep track of the position of the key characters that are
15946                  * more likely to have been class-enders */
15947                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15948
15949                     /* Allow just one such possible class-ender not actually
15950                      * ending the class. */
15951                     if (possible_end) {
15952                         break;
15953                     }
15954                     possible_end = p;
15955                 }
15956
15957                 /* If we have too many punctuation characters, no use in
15958                  * keeping going */
15959                 if (++punct_count > max_distance) {
15960                     break;
15961                 }
15962
15963                 /* Treat the punctuation as a typo. */
15964                 input_text[name_len++] = *p;
15965                 p++;
15966             }
15967             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15968                 input_text[name_len++] = toLOWER(*p);
15969                 has_upper = TRUE;
15970                 found_problem = TRUE;
15971                 p++;
15972             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15973                 input_text[name_len++] = *p;
15974                 p++;
15975             }
15976             else {
15977                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15978                 p+= UTF8SKIP(p);
15979             }
15980
15981             /* The declaration of 'input_text' is how long we allow a potential
15982              * class name to be, before saying they didn't mean a class name at
15983              * all */
15984             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15985                 break;
15986             }
15987         }
15988
15989         /* We get to here when the possible class name hasn't been properly
15990          * terminated before:
15991          *   1) we ran off the end of the pattern; or
15992          *   2) found two characters, each of which might have been intended to
15993          *      be the name's terminator
15994          *   3) found so many punctuation characters in the purported name,
15995          *      that the edit distance to a valid one is exceeded
15996          *   4) we decided it was more characters than anyone could have
15997          *      intended to be one. */
15998
15999         found_problem = TRUE;
16000
16001         /* In the final two cases, we know that looking up what we've
16002          * accumulated won't lead to a match, even a fuzzy one. */
16003         if (   name_len >= C_ARRAY_LENGTH(input_text)
16004             || punct_count > max_distance)
16005         {
16006             /* If there was an intermediate key character that could have been
16007              * an intended end, redo the parse, but stop there */
16008             if (possible_end && possible_end != (char *) -1) {
16009                 possible_end = (char *) -1; /* Special signal value to say
16010                                                we've done a first pass */
16011                 p = name_start;
16012                 goto parse_name;
16013             }
16014
16015             /* Otherwise, it can't have meant to have been a class */
16016             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16017         }
16018
16019         /* If we ran off the end, and the final character was a punctuation
16020          * one, back up one, to look at that final one just below.  Later, we
16021          * will restore the parse pointer if appropriate */
16022         if (name_len && p == e && isPUNCT(*(p-1))) {
16023             p--;
16024             name_len--;
16025         }
16026
16027         if (p < e && isPUNCT(*p)) {
16028             if (*p == ']') {
16029                 has_terminating_bracket = TRUE;
16030
16031                 /* If this is a 2nd ']', and the first one is just below this
16032                  * one, consider that to be the real terminator.  This gives a
16033                  * uniform and better positioning for the warning message  */
16034                 if (   possible_end
16035                     && possible_end != (char *) -1
16036                     && *possible_end == ']'
16037                     && name_len && input_text[name_len - 1] == ']')
16038                 {
16039                     name_len--;
16040                     p = possible_end;
16041
16042                     /* And this is actually equivalent to having done the 2nd
16043                      * pass now, so set it to not try again */
16044                     possible_end = (char *) -1;
16045                 }
16046             }
16047             else {
16048                 if (*p == ':') {
16049                     has_terminating_colon = TRUE;
16050                 }
16051                 else if (*p == ';') {
16052                     has_semi_colon = TRUE;
16053                     has_terminating_colon = TRUE;
16054                 }
16055                 p++;
16056             }
16057         }
16058
16059     try_posix:
16060
16061         /* Here, we have a class name to look up.  We can short circuit the
16062          * stuff below for short names that can't possibly be meant to be a
16063          * class name.  (We can do this on the first pass, as any second pass
16064          * will yield an even shorter name) */
16065         if (name_len < 3) {
16066             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16067         }
16068
16069         /* Find which class it is.  Initially switch on the length of the name.
16070          * */
16071         switch (name_len) {
16072             case 4:
16073                 if (memEQs(name_start, 4, "word")) {
16074                     /* this is not POSIX, this is the Perl \w */
16075                     class_number = ANYOF_WORDCHAR;
16076                 }
16077                 break;
16078             case 5:
16079                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16080                  *                        graph lower print punct space upper
16081                  * Offset 4 gives the best switch position.  */
16082                 switch (name_start[4]) {
16083                     case 'a':
16084                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
16085                             class_number = ANYOF_ALPHA;
16086                         break;
16087                     case 'e':
16088                         if (memBEGINs(name_start, 5, "spac")) /* space */
16089                             class_number = ANYOF_SPACE;
16090                         break;
16091                     case 'h':
16092                         if (memBEGINs(name_start, 5, "grap")) /* graph */
16093                             class_number = ANYOF_GRAPH;
16094                         break;
16095                     case 'i':
16096                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
16097                             class_number = ANYOF_ASCII;
16098                         break;
16099                     case 'k':
16100                         if (memBEGINs(name_start, 5, "blan")) /* blank */
16101                             class_number = ANYOF_BLANK;
16102                         break;
16103                     case 'l':
16104                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16105                             class_number = ANYOF_CNTRL;
16106                         break;
16107                     case 'm':
16108                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16109                             class_number = ANYOF_ALPHANUMERIC;
16110                         break;
16111                     case 'r':
16112                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
16113                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16114                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16115                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16116                         break;
16117                     case 't':
16118                         if (memBEGINs(name_start, 5, "digi")) /* digit */
16119                             class_number = ANYOF_DIGIT;
16120                         else if (memBEGINs(name_start, 5, "prin")) /* print */
16121                             class_number = ANYOF_PRINT;
16122                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
16123                             class_number = ANYOF_PUNCT;
16124                         break;
16125                 }
16126                 break;
16127             case 6:
16128                 if (memEQs(name_start, 6, "xdigit"))
16129                     class_number = ANYOF_XDIGIT;
16130                 break;
16131         }
16132
16133         /* If the name exactly matches a posix class name the class number will
16134          * here be set to it, and the input almost certainly was meant to be a
16135          * posix class, so we can skip further checking.  If instead the syntax
16136          * is exactly correct, but the name isn't one of the legal ones, we
16137          * will return that as an error below.  But if neither of these apply,
16138          * it could be that no posix class was intended at all, or that one
16139          * was, but there was a typo.  We tease these apart by doing fuzzy
16140          * matching on the name */
16141         if (class_number == OOB_NAMEDCLASS && found_problem) {
16142             const UV posix_names[][6] = {
16143                                                 { 'a', 'l', 'n', 'u', 'm' },
16144                                                 { 'a', 'l', 'p', 'h', 'a' },
16145                                                 { 'a', 's', 'c', 'i', 'i' },
16146                                                 { 'b', 'l', 'a', 'n', 'k' },
16147                                                 { 'c', 'n', 't', 'r', 'l' },
16148                                                 { 'd', 'i', 'g', 'i', 't' },
16149                                                 { 'g', 'r', 'a', 'p', 'h' },
16150                                                 { 'l', 'o', 'w', 'e', 'r' },
16151                                                 { 'p', 'r', 'i', 'n', 't' },
16152                                                 { 'p', 'u', 'n', 'c', 't' },
16153                                                 { 's', 'p', 'a', 'c', 'e' },
16154                                                 { 'u', 'p', 'p', 'e', 'r' },
16155                                                 { 'w', 'o', 'r', 'd' },
16156                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
16157                                             };
16158             /* The names of the above all have added NULs to make them the same
16159              * size, so we need to also have the real lengths */
16160             const UV posix_name_lengths[] = {
16161                                                 sizeof("alnum") - 1,
16162                                                 sizeof("alpha") - 1,
16163                                                 sizeof("ascii") - 1,
16164                                                 sizeof("blank") - 1,
16165                                                 sizeof("cntrl") - 1,
16166                                                 sizeof("digit") - 1,
16167                                                 sizeof("graph") - 1,
16168                                                 sizeof("lower") - 1,
16169                                                 sizeof("print") - 1,
16170                                                 sizeof("punct") - 1,
16171                                                 sizeof("space") - 1,
16172                                                 sizeof("upper") - 1,
16173                                                 sizeof("word")  - 1,
16174                                                 sizeof("xdigit")- 1
16175                                             };
16176             unsigned int i;
16177             int temp_max = max_distance;    /* Use a temporary, so if we
16178                                                reparse, we haven't changed the
16179                                                outer one */
16180
16181             /* Use a smaller max edit distance if we are missing one of the
16182              * delimiters */
16183             if (   has_opening_bracket + has_opening_colon < 2
16184                 || has_terminating_bracket + has_terminating_colon < 2)
16185             {
16186                 temp_max--;
16187             }
16188
16189             /* See if the input name is close to a legal one */
16190             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16191
16192                 /* Short circuit call if the lengths are too far apart to be
16193                  * able to match */
16194                 if (abs( (int) (name_len - posix_name_lengths[i]))
16195                     > temp_max)
16196                 {
16197                     continue;
16198                 }
16199
16200                 if (edit_distance(input_text,
16201                                   posix_names[i],
16202                                   name_len,
16203                                   posix_name_lengths[i],
16204                                   temp_max
16205                                  )
16206                     > -1)
16207                 { /* If it is close, it probably was intended to be a class */
16208                     goto probably_meant_to_be;
16209                 }
16210             }
16211
16212             /* Here the input name is not close enough to a valid class name
16213              * for us to consider it to be intended to be a posix class.  If
16214              * we haven't already done so, and the parse found a character that
16215              * could have been terminators for the name, but which we absorbed
16216              * as typos during the first pass, repeat the parse, signalling it
16217              * to stop at that character */
16218             if (possible_end && possible_end != (char *) -1) {
16219                 possible_end = (char *) -1;
16220                 p = name_start;
16221                 goto parse_name;
16222             }
16223
16224             /* Here neither pass found a close-enough class name */
16225             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16226         }
16227
16228     probably_meant_to_be:
16229
16230         /* Here we think that a posix specification was intended.  Update any
16231          * parse pointer */
16232         if (updated_parse_ptr) {
16233             *updated_parse_ptr = (char *) p;
16234         }
16235
16236         /* If a posix class name was intended but incorrectly specified, we
16237          * output or return the warnings */
16238         if (found_problem) {
16239
16240             /* We set flags for these issues in the parse loop above instead of
16241              * adding them to the list of warnings, because we can parse it
16242              * twice, and we only want one warning instance */
16243             if (has_upper) {
16244                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16245             }
16246             if (has_blank) {
16247                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16248             }
16249             if (has_semi_colon) {
16250                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16251             }
16252             else if (! has_terminating_colon) {
16253                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16254             }
16255             if (! has_terminating_bracket) {
16256                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16257             }
16258
16259             if (   posix_warnings
16260                 && RExC_warn_text
16261                 && av_count(RExC_warn_text) > 0)
16262             {
16263                 *posix_warnings = RExC_warn_text;
16264             }
16265         }
16266         else if (class_number != OOB_NAMEDCLASS) {
16267             /* If it is a known class, return the class.  The class number
16268              * #defines are structured so each complement is +1 to the normal
16269              * one */
16270             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16271         }
16272         else if (! check_only) {
16273
16274             /* Here, it is an unrecognized class.  This is an error (unless the
16275             * call is to check only, which we've already handled above) */
16276             const char * const complement_string = (complement)
16277                                                    ? "^"
16278                                                    : "";
16279             RExC_parse = (char *) p;
16280             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16281                         complement_string,
16282                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16283         }
16284     }
16285
16286     return OOB_NAMEDCLASS;
16287 }
16288 #undef ADD_POSIX_WARNING
16289
16290 STATIC unsigned  int
16291 S_regex_set_precedence(const U8 my_operator) {
16292
16293     /* Returns the precedence in the (?[...]) construct of the input operator,
16294      * specified by its character representation.  The precedence follows
16295      * general Perl rules, but it extends this so that ')' and ']' have (low)
16296      * precedence even though they aren't really operators */
16297
16298     switch (my_operator) {
16299         case '!':
16300             return 5;
16301         case '&':
16302             return 4;
16303         case '^':
16304         case '|':
16305         case '+':
16306         case '-':
16307             return 3;
16308         case ')':
16309             return 2;
16310         case ']':
16311             return 1;
16312     }
16313
16314     NOT_REACHED; /* NOTREACHED */
16315     return 0;   /* Silence compiler warning */
16316 }
16317
16318 STATIC regnode_offset
16319 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16320                     I32 *flagp, U32 depth,
16321                     char * const oregcomp_parse)
16322 {
16323     /* Handle the (?[...]) construct to do set operations */
16324
16325     U8 curchar;                     /* Current character being parsed */
16326     UV start, end;                  /* End points of code point ranges */
16327     SV* final = NULL;               /* The end result inversion list */
16328     SV* result_string;              /* 'final' stringified */
16329     AV* stack;                      /* stack of operators and operands not yet
16330                                        resolved */
16331     AV* fence_stack = NULL;         /* A stack containing the positions in
16332                                        'stack' of where the undealt-with left
16333                                        parens would be if they were actually
16334                                        put there */
16335     /* The 'volatile' is a workaround for an optimiser bug
16336      * in Solaris Studio 12.3. See RT #127455 */
16337     volatile IV fence = 0;          /* Position of where most recent undealt-
16338                                        with left paren in stack is; -1 if none.
16339                                      */
16340     STRLEN len;                     /* Temporary */
16341     regnode_offset node;            /* Temporary, and final regnode returned by
16342                                        this function */
16343     const bool save_fold = FOLD;    /* Temporary */
16344     char *save_end, *save_parse;    /* Temporaries */
16345     const bool in_locale = LOC;     /* we turn off /l during processing */
16346
16347     DECLARE_AND_GET_RE_DEBUG_FLAGS;
16348
16349     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16350     PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16351
16352     DEBUG_PARSE("xcls");
16353
16354     if (in_locale) {
16355         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16356     }
16357
16358     /* The use of this operator implies /u.  This is required so that the
16359      * compile time values are valid in all runtime cases */
16360     REQUIRE_UNI_RULES(flagp, 0);
16361
16362     ckWARNexperimental(RExC_parse,
16363                        WARN_EXPERIMENTAL__REGEX_SETS,
16364                        "The regex_sets feature is experimental");
16365
16366     /* Everything in this construct is a metacharacter.  Operands begin with
16367      * either a '\' (for an escape sequence), or a '[' for a bracketed
16368      * character class.  Any other character should be an operator, or
16369      * parenthesis for grouping.  Both types of operands are handled by calling
16370      * regclass() to parse them.  It is called with a parameter to indicate to
16371      * return the computed inversion list.  The parsing here is implemented via
16372      * a stack.  Each entry on the stack is a single character representing one
16373      * of the operators; or else a pointer to an operand inversion list. */
16374
16375 #define IS_OPERATOR(a) SvIOK(a)
16376 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16377
16378     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16379      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16380      * with pronouncing it called it Reverse Polish instead, but now that YOU
16381      * know how to pronounce it you can use the correct term, thus giving due
16382      * credit to the person who invented it, and impressing your geek friends.
16383      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16384      * it is now more like an English initial W (as in wonk) than an L.)
16385      *
16386      * This means that, for example, 'a | b & c' is stored on the stack as
16387      *
16388      * c  [4]
16389      * b  [3]
16390      * &  [2]
16391      * a  [1]
16392      * |  [0]
16393      *
16394      * where the numbers in brackets give the stack [array] element number.
16395      * In this implementation, parentheses are not stored on the stack.
16396      * Instead a '(' creates a "fence" so that the part of the stack below the
16397      * fence is invisible except to the corresponding ')' (this allows us to
16398      * replace testing for parens, by using instead subtraction of the fence
16399      * position).  As new operands are processed they are pushed onto the stack
16400      * (except as noted in the next paragraph).  New operators of higher
16401      * precedence than the current final one are inserted on the stack before
16402      * the lhs operand (so that when the rhs is pushed next, everything will be
16403      * in the correct positions shown above.  When an operator of equal or
16404      * lower precedence is encountered in parsing, all the stacked operations
16405      * of equal or higher precedence are evaluated, leaving the result as the
16406      * top entry on the stack.  This makes higher precedence operations
16407      * evaluate before lower precedence ones, and causes operations of equal
16408      * precedence to left associate.
16409      *
16410      * The only unary operator '!' is immediately pushed onto the stack when
16411      * encountered.  When an operand is encountered, if the top of the stack is
16412      * a '!", the complement is immediately performed, and the '!' popped.  The
16413      * resulting value is treated as a new operand, and the logic in the
16414      * previous paragraph is executed.  Thus in the expression
16415      *      [a] + ! [b]
16416      * the stack looks like
16417      *
16418      * !
16419      * a
16420      * +
16421      *
16422      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16423      * becomes
16424      *
16425      * !b
16426      * a
16427      * +
16428      *
16429      * A ')' is treated as an operator with lower precedence than all the
16430      * aforementioned ones, which causes all operations on the stack above the
16431      * corresponding '(' to be evaluated down to a single resultant operand.
16432      * Then the fence for the '(' is removed, and the operand goes through the
16433      * algorithm above, without the fence.
16434      *
16435      * A separate stack is kept of the fence positions, so that the position of
16436      * the latest so-far unbalanced '(' is at the top of it.
16437      *
16438      * The ']' ending the construct is treated as the lowest operator of all,
16439      * so that everything gets evaluated down to a single operand, which is the
16440      * result */
16441
16442     sv_2mortal((SV *)(stack = newAV()));
16443     sv_2mortal((SV *)(fence_stack = newAV()));
16444
16445     while (RExC_parse < RExC_end) {
16446         I32 top_index;              /* Index of top-most element in 'stack' */
16447         SV** top_ptr;               /* Pointer to top 'stack' element */
16448         SV* current = NULL;         /* To contain the current inversion list
16449                                        operand */
16450         SV* only_to_avoid_leaks;
16451
16452         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16453                                 TRUE /* Force /x */ );
16454         if (RExC_parse >= RExC_end) {   /* Fail */
16455             break;
16456         }
16457
16458         curchar = UCHARAT(RExC_parse);
16459
16460 redo_curchar:
16461
16462 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16463                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16464         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16465                                            stack, fence, fence_stack));
16466 #endif
16467
16468         top_index = av_tindex_skip_len_mg(stack);
16469
16470         switch (curchar) {
16471             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16472             char stacked_operator;  /* The topmost operator on the 'stack'. */
16473             SV* lhs;                /* Operand to the left of the operator */
16474             SV* rhs;                /* Operand to the right of the operator */
16475             SV* fence_ptr;          /* Pointer to top element of the fence
16476                                        stack */
16477             case '(':
16478
16479                 if (   RExC_parse < RExC_end - 2
16480                     && UCHARAT(RExC_parse + 1) == '?'
16481                     && UCHARAT(RExC_parse + 2) == '^')
16482                 {
16483                     const regnode_offset orig_emit = RExC_emit;
16484                     SV * resultant_invlist;
16485
16486                     /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16487                      * This happens when we have some thing like
16488                      *
16489                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16490                      *   ...
16491                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16492                      *
16493                      * Here we would be handling the interpolated
16494                      * '$thai_or_lao'.  We handle this by a recursive call to
16495                      * reg which returns the inversion list the
16496                      * interpolated expression evaluates to.  Actually, the
16497                      * return is a special regnode containing a pointer to that
16498                      * inversion list.  If the return isn't that regnode alone,
16499                      * we know that this wasn't such an interpolation, which is
16500                      * an error: we need to get a single inversion list back
16501                      * from the recursion */
16502
16503                     RExC_parse++;
16504                     RExC_sets_depth++;
16505
16506                     node = reg(pRExC_state, 2, flagp, depth+1);
16507                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16508
16509                     if (   OP(REGNODE_p(node)) != REGEX_SET
16510                            /* If more than a single node returned, the nested
16511                             * parens evaluated to more than just a (?[...]),
16512                             * which isn't legal */
16513                         || RExC_emit != orig_emit
16514                                       + NODE_STEP_REGNODE
16515                                       + regarglen[REGEX_SET])
16516                     {
16517                         vFAIL("Expecting interpolated extended charclass");
16518                     }
16519                     resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16520                     current = invlist_clone(resultant_invlist, NULL);
16521                     SvREFCNT_dec(resultant_invlist);
16522
16523                     RExC_sets_depth--;
16524                     RExC_emit = orig_emit;
16525                     goto handle_operand;
16526                 }
16527
16528                 /* A regular '('.  Look behind for illegal syntax */
16529                 if (top_index - fence >= 0) {
16530                     /* If the top entry on the stack is an operator, it had
16531                      * better be a '!', otherwise the entry below the top
16532                      * operand should be an operator */
16533                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16534                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16535                         || (   IS_OPERAND(*top_ptr)
16536                             && (   top_index - fence < 1
16537                                 || ! (stacked_ptr = av_fetch(stack,
16538                                                              top_index - 1,
16539                                                              FALSE))
16540                                 || ! IS_OPERATOR(*stacked_ptr))))
16541                     {
16542                         RExC_parse++;
16543                         vFAIL("Unexpected '(' with no preceding operator");
16544                     }
16545                 }
16546
16547                 /* Stack the position of this undealt-with left paren */
16548                 av_push(fence_stack, newSViv(fence));
16549                 fence = top_index + 1;
16550                 break;
16551
16552             case '\\':
16553                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16554                  * multi-char folds are allowed.  */
16555                 if (!regclass(pRExC_state, flagp, depth+1,
16556                               TRUE, /* means parse just the next thing */
16557                               FALSE, /* don't allow multi-char folds */
16558                               FALSE, /* don't silence non-portable warnings.  */
16559                               TRUE,  /* strict */
16560                               FALSE, /* Require return to be an ANYOF */
16561                               &current))
16562                 {
16563                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16564                     goto regclass_failed;
16565                 }
16566
16567                 assert(current);
16568
16569                 /* regclass() will return with parsing just the \ sequence,
16570                  * leaving the parse pointer at the next thing to parse */
16571                 RExC_parse--;
16572                 goto handle_operand;
16573
16574             case '[':   /* Is a bracketed character class */
16575             {
16576                 /* See if this is a [:posix:] class. */
16577                 bool is_posix_class = (OOB_NAMEDCLASS
16578                             < handle_possible_posix(pRExC_state,
16579                                                 RExC_parse + 1,
16580                                                 NULL,
16581                                                 NULL,
16582                                                 TRUE /* checking only */));
16583                 /* If it is a posix class, leave the parse pointer at the '['
16584                  * to fool regclass() into thinking it is part of a
16585                  * '[[:posix:]]'. */
16586                 if (! is_posix_class) {
16587                     RExC_parse++;
16588                 }
16589
16590                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16591                  * multi-char folds are allowed.  */
16592                 if (!regclass(pRExC_state, flagp, depth+1,
16593                                 is_posix_class, /* parse the whole char
16594                                                     class only if not a
16595                                                     posix class */
16596                                 FALSE, /* don't allow multi-char folds */
16597                                 TRUE, /* silence non-portable warnings. */
16598                                 TRUE, /* strict */
16599                                 FALSE, /* Require return to be an ANYOF */
16600                                 &current))
16601                 {
16602                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16603                     goto regclass_failed;
16604                 }
16605
16606                 assert(current);
16607
16608                 /* function call leaves parse pointing to the ']', except if we
16609                  * faked it */
16610                 if (is_posix_class) {
16611                     RExC_parse--;
16612                 }
16613
16614                 goto handle_operand;
16615             }
16616
16617             case ']':
16618                 if (top_index >= 1) {
16619                     goto join_operators;
16620                 }
16621
16622                 /* Only a single operand on the stack: are done */
16623                 goto done;
16624
16625             case ')':
16626                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16627                     if (UCHARAT(RExC_parse - 1) == ']')  {
16628                         break;
16629                     }
16630                     RExC_parse++;
16631                     vFAIL("Unexpected ')'");
16632                 }
16633
16634                 /* If nothing after the fence, is missing an operand */
16635                 if (top_index - fence < 0) {
16636                     RExC_parse++;
16637                     goto bad_syntax;
16638                 }
16639                 /* If at least two things on the stack, treat this as an
16640                   * operator */
16641                 if (top_index - fence >= 1) {
16642                     goto join_operators;
16643                 }
16644
16645                 /* Here only a single thing on the fenced stack, and there is a
16646                  * fence.  Get rid of it */
16647                 fence_ptr = av_pop(fence_stack);
16648                 assert(fence_ptr);
16649                 fence = SvIV(fence_ptr);
16650                 SvREFCNT_dec_NN(fence_ptr);
16651                 fence_ptr = NULL;
16652
16653                 if (fence < 0) {
16654                     fence = 0;
16655                 }
16656
16657                 /* Having gotten rid of the fence, we pop the operand at the
16658                  * stack top and process it as a newly encountered operand */
16659                 current = av_pop(stack);
16660                 if (IS_OPERAND(current)) {
16661                     goto handle_operand;
16662                 }
16663
16664                 RExC_parse++;
16665                 goto bad_syntax;
16666
16667             case '&':
16668             case '|':
16669             case '+':
16670             case '-':
16671             case '^':
16672
16673                 /* These binary operators should have a left operand already
16674                  * parsed */
16675                 if (   top_index - fence < 0
16676                     || top_index - fence == 1
16677                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16678                     || ! IS_OPERAND(*top_ptr))
16679                 {
16680                     goto unexpected_binary;
16681                 }
16682
16683                 /* If only the one operand is on the part of the stack visible
16684                  * to us, we just place this operator in the proper position */
16685                 if (top_index - fence < 2) {
16686
16687                     /* Place the operator before the operand */
16688
16689                     SV* lhs = av_pop(stack);
16690                     av_push(stack, newSVuv(curchar));
16691                     av_push(stack, lhs);
16692                     break;
16693                 }
16694
16695                 /* But if there is something else on the stack, we need to
16696                  * process it before this new operator if and only if the
16697                  * stacked operation has equal or higher precedence than the
16698                  * new one */
16699
16700              join_operators:
16701
16702                 /* The operator on the stack is supposed to be below both its
16703                  * operands */
16704                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16705                     || IS_OPERAND(*stacked_ptr))
16706                 {
16707                     /* But if not, it's legal and indicates we are completely
16708                      * done if and only if we're currently processing a ']',
16709                      * which should be the final thing in the expression */
16710                     if (curchar == ']') {
16711                         goto done;
16712                     }
16713
16714                   unexpected_binary:
16715                     RExC_parse++;
16716                     vFAIL2("Unexpected binary operator '%c' with no "
16717                            "preceding operand", curchar);
16718                 }
16719                 stacked_operator = (char) SvUV(*stacked_ptr);
16720
16721                 if (regex_set_precedence(curchar)
16722                     > regex_set_precedence(stacked_operator))
16723                 {
16724                     /* Here, the new operator has higher precedence than the
16725                      * stacked one.  This means we need to add the new one to
16726                      * the stack to await its rhs operand (and maybe more
16727                      * stuff).  We put it before the lhs operand, leaving
16728                      * untouched the stacked operator and everything below it
16729                      * */
16730                     lhs = av_pop(stack);
16731                     assert(IS_OPERAND(lhs));
16732
16733                     av_push(stack, newSVuv(curchar));
16734                     av_push(stack, lhs);
16735                     break;
16736                 }
16737
16738                 /* Here, the new operator has equal or lower precedence than
16739                  * what's already there.  This means the operation already
16740                  * there should be performed now, before the new one. */
16741
16742                 rhs = av_pop(stack);
16743                 if (! IS_OPERAND(rhs)) {
16744
16745                     /* This can happen when a ! is not followed by an operand,
16746                      * like in /(?[\t &!])/ */
16747                     goto bad_syntax;
16748                 }
16749
16750                 lhs = av_pop(stack);
16751
16752                 if (! IS_OPERAND(lhs)) {
16753
16754                     /* This can happen when there is an empty (), like in
16755                      * /(?[[0]+()+])/ */
16756                     goto bad_syntax;
16757                 }
16758
16759                 switch (stacked_operator) {
16760                     case '&':
16761                         _invlist_intersection(lhs, rhs, &rhs);
16762                         break;
16763
16764                     case '|':
16765                     case '+':
16766                         _invlist_union(lhs, rhs, &rhs);
16767                         break;
16768
16769                     case '-':
16770                         _invlist_subtract(lhs, rhs, &rhs);
16771                         break;
16772
16773                     case '^':   /* The union minus the intersection */
16774                     {
16775                         SV* i = NULL;
16776                         SV* u = NULL;
16777
16778                         _invlist_union(lhs, rhs, &u);
16779                         _invlist_intersection(lhs, rhs, &i);
16780                         _invlist_subtract(u, i, &rhs);
16781                         SvREFCNT_dec_NN(i);
16782                         SvREFCNT_dec_NN(u);
16783                         break;
16784                     }
16785                 }
16786                 SvREFCNT_dec(lhs);
16787
16788                 /* Here, the higher precedence operation has been done, and the
16789                  * result is in 'rhs'.  We overwrite the stacked operator with
16790                  * the result.  Then we redo this code to either push the new
16791                  * operator onto the stack or perform any higher precedence
16792                  * stacked operation */
16793                 only_to_avoid_leaks = av_pop(stack);
16794                 SvREFCNT_dec(only_to_avoid_leaks);
16795                 av_push(stack, rhs);
16796                 goto redo_curchar;
16797
16798             case '!':   /* Highest priority, right associative */
16799
16800                 /* If what's already at the top of the stack is another '!",
16801                  * they just cancel each other out */
16802                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16803                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16804                 {
16805                     only_to_avoid_leaks = av_pop(stack);
16806                     SvREFCNT_dec(only_to_avoid_leaks);
16807                 }
16808                 else { /* Otherwise, since it's right associative, just push
16809                           onto the stack */
16810                     av_push(stack, newSVuv(curchar));
16811                 }
16812                 break;
16813
16814             default:
16815                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16816                 if (RExC_parse >= RExC_end) {
16817                     break;
16818                 }
16819                 vFAIL("Unexpected character");
16820
16821           handle_operand:
16822
16823             /* Here 'current' is the operand.  If something is already on the
16824              * stack, we have to check if it is a !.  But first, the code above
16825              * may have altered the stack in the time since we earlier set
16826              * 'top_index'.  */
16827
16828             top_index = av_tindex_skip_len_mg(stack);
16829             if (top_index - fence >= 0) {
16830                 /* If the top entry on the stack is an operator, it had better
16831                  * be a '!', otherwise the entry below the top operand should
16832                  * be an operator */
16833                 top_ptr = av_fetch(stack, top_index, FALSE);
16834                 assert(top_ptr);
16835                 if (IS_OPERATOR(*top_ptr)) {
16836
16837                     /* The only permissible operator at the top of the stack is
16838                      * '!', which is applied immediately to this operand. */
16839                     curchar = (char) SvUV(*top_ptr);
16840                     if (curchar != '!') {
16841                         SvREFCNT_dec(current);
16842                         vFAIL2("Unexpected binary operator '%c' with no "
16843                                 "preceding operand", curchar);
16844                     }
16845
16846                     _invlist_invert(current);
16847
16848                     only_to_avoid_leaks = av_pop(stack);
16849                     SvREFCNT_dec(only_to_avoid_leaks);
16850
16851                     /* And we redo with the inverted operand.  This allows
16852                      * handling multiple ! in a row */
16853                     goto handle_operand;
16854                 }
16855                           /* Single operand is ok only for the non-binary ')'
16856                            * operator */
16857                 else if ((top_index - fence == 0 && curchar != ')')
16858                          || (top_index - fence > 0
16859                              && (! (stacked_ptr = av_fetch(stack,
16860                                                            top_index - 1,
16861                                                            FALSE))
16862                                  || IS_OPERAND(*stacked_ptr))))
16863                 {
16864                     SvREFCNT_dec(current);
16865                     vFAIL("Operand with no preceding operator");
16866                 }
16867             }
16868
16869             /* Here there was nothing on the stack or the top element was
16870              * another operand.  Just add this new one */
16871             av_push(stack, current);
16872
16873         } /* End of switch on next parse token */
16874
16875         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16876     } /* End of loop parsing through the construct */
16877
16878     vFAIL("Syntax error in (?[...])");
16879
16880   done:
16881
16882     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16883         if (RExC_parse < RExC_end) {
16884             RExC_parse++;
16885         }
16886
16887         vFAIL("Unexpected ']' with no following ')' in (?[...");
16888     }
16889
16890     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16891         vFAIL("Unmatched (");
16892     }
16893
16894     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16895         || ((final = av_pop(stack)) == NULL)
16896         || ! IS_OPERAND(final)
16897         || ! is_invlist(final)
16898         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16899     {
16900       bad_syntax:
16901         SvREFCNT_dec(final);
16902         vFAIL("Incomplete expression within '(?[ ])'");
16903     }
16904
16905     /* Here, 'final' is the resultant inversion list from evaluating the
16906      * expression.  Return it if so requested */
16907     if (return_invlist) {
16908         *return_invlist = final;
16909         return END;
16910     }
16911
16912     if (RExC_sets_depth) {  /* If within a recursive call, return in a special
16913                                regnode */
16914         RExC_parse++;
16915         node = regpnode(pRExC_state, REGEX_SET, final);
16916     }
16917     else {
16918
16919         /* Otherwise generate a resultant node, based on 'final'.  regclass()
16920          * is expecting a string of ranges and individual code points */
16921         invlist_iterinit(final);
16922         result_string = newSVpvs("");
16923         while (invlist_iternext(final, &start, &end)) {
16924             if (start == end) {
16925                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16926             }
16927             else {
16928                 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16929                                                         UVXf "}", start, end);
16930             }
16931         }
16932
16933         /* About to generate an ANYOF (or similar) node from the inversion list
16934          * we have calculated */
16935         save_parse = RExC_parse;
16936         RExC_parse = SvPV(result_string, len);
16937         save_end = RExC_end;
16938         RExC_end = RExC_parse + len;
16939         TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16940
16941         /* We turn off folding around the call, as the class we have
16942          * constructed already has all folding taken into consideration, and we
16943          * don't want regclass() to add to that */
16944         RExC_flags &= ~RXf_PMf_FOLD;
16945         /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16946          * folds are allowed.  */
16947         node = regclass(pRExC_state, flagp, depth+1,
16948                         FALSE, /* means parse the whole char class */
16949                         FALSE, /* don't allow multi-char folds */
16950                         TRUE, /* silence non-portable warnings.  The above may
16951                                  very well have generated non-portable code
16952                                  points, but they're valid on this machine */
16953                         FALSE, /* similarly, no need for strict */
16954
16955                         /* We can optimize into something besides an ANYOF,
16956                          * except under /l, which needs to be ANYOF because of
16957                          * runtime checks for locale sanity, etc */
16958                     ! in_locale,
16959                         NULL
16960                     );
16961
16962         RESTORE_WARNINGS;
16963         RExC_parse = save_parse + 1;
16964         RExC_end = save_end;
16965         SvREFCNT_dec_NN(final);
16966         SvREFCNT_dec_NN(result_string);
16967
16968         if (save_fold) {
16969             RExC_flags |= RXf_PMf_FOLD;
16970         }
16971
16972         if (!node) {
16973             RETURN_FAIL_ON_RESTART(*flagp, flagp);
16974             goto regclass_failed;
16975         }
16976
16977         /* Fix up the node type if we are in locale.  (We have pretended we are
16978          * under /u for the purposes of regclass(), as this construct will only
16979          * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
16980          * (so as to cause any warnings about bad locales to be output in
16981          * regexec.c), and add the flag that indicates to check if not in a
16982          * UTF-8 locale.  The reason we above forbid optimization into
16983          * something other than an ANYOF node is simply to minimize the number
16984          * of code changes in regexec.c.  Otherwise we would have to create new
16985          * EXACTish node types and deal with them.  This decision could be
16986          * revisited should this construct become popular.
16987          *
16988          * (One might think we could look at the resulting ANYOF node and
16989          * suppress the flag if everything is above 255, as those would be
16990          * UTF-8 only, but this isn't true, as the components that led to that
16991          * result could have been locale-affected, and just happen to cancel
16992          * each other out under UTF-8 locales.) */
16993         if (in_locale) {
16994             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16995
16996             assert(OP(REGNODE_p(node)) == ANYOF);
16997
16998             OP(REGNODE_p(node)) = ANYOFL;
16999             ANYOF_FLAGS(REGNODE_p(node))
17000                     |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17001         }
17002     }
17003
17004     nextchar(pRExC_state);
17005     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
17006     return node;
17007
17008   regclass_failed:
17009     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17010                                                                 (UV) *flagp);
17011 }
17012
17013 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17014
17015 STATIC void
17016 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17017                              AV * stack, const IV fence, AV * fence_stack)
17018 {   /* Dumps the stacks in handle_regex_sets() */
17019
17020     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17021     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17022     SSize_t i;
17023
17024     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17025
17026     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17027
17028     if (stack_top < 0) {
17029         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17030     }
17031     else {
17032         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17033         for (i = stack_top; i >= 0; i--) {
17034             SV ** element_ptr = av_fetch(stack, i, FALSE);
17035             if (! element_ptr) {
17036             }
17037
17038             if (IS_OPERATOR(*element_ptr)) {
17039                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17040                                             (int) i, (int) SvIV(*element_ptr));
17041             }
17042             else {
17043                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17044                 sv_dump(*element_ptr);
17045             }
17046         }
17047     }
17048
17049     if (fence_stack_top < 0) {
17050         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17051     }
17052     else {
17053         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17054         for (i = fence_stack_top; i >= 0; i--) {
17055             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17056             if (! element_ptr) {
17057             }
17058
17059             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17060                                             (int) i, (int) SvIV(*element_ptr));
17061         }
17062     }
17063 }
17064
17065 #endif
17066
17067 #undef IS_OPERATOR
17068 #undef IS_OPERAND
17069
17070 STATIC void
17071 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17072 {
17073     /* This adds the Latin1/above-Latin1 folding rules.
17074      *
17075      * This should be called only for a Latin1-range code points, cp, which is
17076      * known to be involved in a simple fold with other code points above
17077      * Latin1.  It would give false results if /aa has been specified.
17078      * Multi-char folds are outside the scope of this, and must be handled
17079      * specially. */
17080
17081     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17082
17083     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17084
17085     /* The rules that are valid for all Unicode versions are hard-coded in */
17086     switch (cp) {
17087         case 'k':
17088         case 'K':
17089           *invlist =
17090              add_cp_to_invlist(*invlist, KELVIN_SIGN);
17091             break;
17092         case 's':
17093         case 'S':
17094           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17095             break;
17096         case MICRO_SIGN:
17097           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17098           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17099             break;
17100         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17101         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17102           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17103             break;
17104         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17105           *invlist = add_cp_to_invlist(*invlist,
17106                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17107             break;
17108
17109         default:    /* Other code points are checked against the data for the
17110                        current Unicode version */
17111           {
17112             Size_t folds_count;
17113             U32 first_fold;
17114             const U32 * remaining_folds;
17115             UV folded_cp;
17116
17117             if (isASCII(cp)) {
17118                 folded_cp = toFOLD(cp);
17119             }
17120             else {
17121                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17122                 Size_t dummy_len;
17123                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17124             }
17125
17126             if (folded_cp > 255) {
17127                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17128             }
17129
17130             folds_count = _inverse_folds(folded_cp, &first_fold,
17131                                                     &remaining_folds);
17132             if (folds_count == 0) {
17133
17134                 /* Use deprecated warning to increase the chances of this being
17135                  * output */
17136                 ckWARN2reg_d(RExC_parse,
17137                         "Perl folding rules are not up-to-date for 0x%02X;"
17138                         " please use the perlbug utility to report;", cp);
17139             }
17140             else {
17141                 unsigned int i;
17142
17143                 if (first_fold > 255) {
17144                     *invlist = add_cp_to_invlist(*invlist, first_fold);
17145                 }
17146                 for (i = 0; i < folds_count - 1; i++) {
17147                     if (remaining_folds[i] > 255) {
17148                         *invlist = add_cp_to_invlist(*invlist,
17149                                                     remaining_folds[i]);
17150                     }
17151                 }
17152             }
17153             break;
17154          }
17155     }
17156 }
17157
17158 STATIC void
17159 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17160 {
17161     /* Output the elements of the array given by '*posix_warnings' as REGEXP
17162      * warnings. */
17163
17164     SV * msg;
17165     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17166
17167     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17168
17169     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17170         CLEAR_POSIX_WARNINGS();
17171         return;
17172     }
17173
17174     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17175         if (first_is_fatal) {           /* Avoid leaking this */
17176             av_undef(posix_warnings);   /* This isn't necessary if the
17177                                             array is mortal, but is a
17178                                             fail-safe */
17179             (void) sv_2mortal(msg);
17180             PREPARE_TO_DIE;
17181         }
17182         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17183         SvREFCNT_dec_NN(msg);
17184     }
17185
17186     UPDATE_WARNINGS_LOC(RExC_parse);
17187 }
17188
17189 PERL_STATIC_INLINE Size_t
17190 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17191 {
17192     const U8 * const start = s1;
17193     const U8 * const send = start + max;
17194
17195     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17196
17197     while (s1 < send && *s1  == *s2) {
17198         s1++; s2++;
17199     }
17200
17201     return s1 - start;
17202 }
17203
17204
17205 STATIC AV *
17206 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17207 {
17208     /* This adds the string scalar <multi_string> to the array
17209      * <multi_char_matches>.  <multi_string> is known to have exactly
17210      * <cp_count> code points in it.  This is used when constructing a
17211      * bracketed character class and we find something that needs to match more
17212      * than a single character.
17213      *
17214      * <multi_char_matches> is actually an array of arrays.  Each top-level
17215      * element is an array that contains all the strings known so far that are
17216      * the same length.  And that length (in number of code points) is the same
17217      * as the index of the top-level array.  Hence, the [2] element is an
17218      * array, each element thereof is a string containing TWO code points;
17219      * while element [3] is for strings of THREE characters, and so on.  Since
17220      * this is for multi-char strings there can never be a [0] nor [1] element.
17221      *
17222      * When we rewrite the character class below, we will do so such that the
17223      * longest strings are written first, so that it prefers the longest
17224      * matching strings first.  This is done even if it turns out that any
17225      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
17226      * Christiansen has agreed that this is ok.  This makes the test for the
17227      * ligature 'ffi' come before the test for 'ff', for example */
17228
17229     AV* this_array;
17230     AV** this_array_ptr;
17231
17232     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17233
17234     if (! multi_char_matches) {
17235         multi_char_matches = newAV();
17236     }
17237
17238     if (av_exists(multi_char_matches, cp_count)) {
17239         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17240         this_array = *this_array_ptr;
17241     }
17242     else {
17243         this_array = newAV();
17244         av_store(multi_char_matches, cp_count,
17245                  (SV*) this_array);
17246     }
17247     av_push(this_array, multi_string);
17248
17249     return multi_char_matches;
17250 }
17251
17252 /* The names of properties whose definitions are not known at compile time are
17253  * stored in this SV, after a constant heading.  So if the length has been
17254  * changed since initialization, then there is a run-time definition. */
17255 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
17256                                         (SvCUR(listsv) != initial_listsv_len)
17257
17258 /* There is a restricted set of white space characters that are legal when
17259  * ignoring white space in a bracketed character class.  This generates the
17260  * code to skip them.
17261  *
17262  * There is a line below that uses the same white space criteria but is outside
17263  * this macro.  Both here and there must use the same definition */
17264 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
17265     STMT_START {                                                        \
17266         if (do_skip) {                                                  \
17267             while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
17268             {                                                           \
17269                 p++;                                                    \
17270             }                                                           \
17271         }                                                               \
17272     } STMT_END
17273
17274 STATIC regnode_offset
17275 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17276                  const bool stop_at_1,  /* Just parse the next thing, don't
17277                                            look for a full character class */
17278                  bool allow_mutiple_chars,
17279                  const bool silence_non_portable,   /* Don't output warnings
17280                                                        about too large
17281                                                        characters */
17282                  const bool strict,
17283                  bool optimizable,                  /* ? Allow a non-ANYOF return
17284                                                        node */
17285                  SV** ret_invlist  /* Return an inversion list, not a node */
17286           )
17287 {
17288     /* parse a bracketed class specification.  Most of these will produce an
17289      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17290      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17291      * under /i with multi-character folds: it will be rewritten following the
17292      * paradigm of this example, where the <multi-fold>s are characters which
17293      * fold to multiple character sequences:
17294      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17295      * gets effectively rewritten as:
17296      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17297      * reg() gets called (recursively) on the rewritten version, and this
17298      * function will return what it constructs.  (Actually the <multi-fold>s
17299      * aren't physically removed from the [abcdefghi], it's just that they are
17300      * ignored in the recursion by means of a flag:
17301      * <RExC_in_multi_char_class>.)
17302      *
17303      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17304      * characters, with the corresponding bit set if that character is in the
17305      * list.  For characters above this, an inversion list is used.  There
17306      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17307      * determinable at compile time
17308      *
17309      * On success, returns the offset at which any next node should be placed
17310      * into the regex engine program being compiled.
17311      *
17312      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17313      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17314      * UTF-8
17315      */
17316
17317     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17318     IV range = 0;
17319     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17320     regnode_offset ret = -1;    /* Initialized to an illegal value */
17321     STRLEN numlen;
17322     int namedclass = OOB_NAMEDCLASS;
17323     char *rangebegin = NULL;
17324     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17325                                aren't available at the time this was called */
17326     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17327                                       than just initialized.  */
17328     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17329     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17330                                extended beyond the Latin1 range.  These have to
17331                                be kept separate from other code points for much
17332                                of this function because their handling  is
17333                                different under /i, and for most classes under
17334                                /d as well */
17335     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17336                                separate for a while from the non-complemented
17337                                versions because of complications with /d
17338                                matching */
17339     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17340                                   treated more simply than the general case,
17341                                   leading to less compilation and execution
17342                                   work */
17343     UV element_count = 0;   /* Number of distinct elements in the class.
17344                                Optimizations may be possible if this is tiny */
17345     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17346                                        character; used under /i */
17347     UV n;
17348     char * stop_ptr = RExC_end;    /* where to stop parsing */
17349
17350     /* ignore unescaped whitespace? */
17351     const bool skip_white = cBOOL(   ret_invlist
17352                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17353
17354     /* inversion list of code points this node matches only when the target
17355      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17356      * /d) */
17357     SV* upper_latin1_only_utf8_matches = NULL;
17358
17359     /* Inversion list of code points this node matches regardless of things
17360      * like locale, folding, utf8ness of the target string */
17361     SV* cp_list = NULL;
17362
17363     /* Like cp_list, but code points on this list need to be checked for things
17364      * that fold to/from them under /i */
17365     SV* cp_foldable_list = NULL;
17366
17367     /* Like cp_list, but code points on this list are valid only when the
17368      * runtime locale is UTF-8 */
17369     SV* only_utf8_locale_list = NULL;
17370
17371     /* In a range, if one of the endpoints is non-character-set portable,
17372      * meaning that it hard-codes a code point that may mean a different
17373      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17374      * mnemonic '\t' which each mean the same character no matter which
17375      * character set the platform is on. */
17376     unsigned int non_portable_endpoint = 0;
17377
17378     /* Is the range unicode? which means on a platform that isn't 1-1 native
17379      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17380      * to be a Unicode value.  */
17381     bool unicode_range = FALSE;
17382     bool invert = FALSE;    /* Is this class to be complemented */
17383
17384     bool warn_super = ALWAYS_WARN_SUPER;
17385
17386     const char * orig_parse = RExC_parse;
17387
17388     /* This variable is used to mark where the end in the input is of something
17389      * that looks like a POSIX construct but isn't.  During the parse, when
17390      * something looks like it could be such a construct is encountered, it is
17391      * checked for being one, but not if we've already checked this area of the
17392      * input.  Only after this position is reached do we check again */
17393     char *not_posix_region_end = RExC_parse - 1;
17394
17395     AV* posix_warnings = NULL;
17396     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17397     U8 op = END;    /* The returned node-type, initialized to an impossible
17398                        one.  */
17399     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17400     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17401
17402
17403 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17404  * mutually exclusive.) */
17405 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17406                                             haven't been defined as of yet */
17407 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17408                                             UTF-8 or not */
17409 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17410                                             what gets folded */
17411     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17412
17413     DECLARE_AND_GET_RE_DEBUG_FLAGS;
17414
17415     PERL_ARGS_ASSERT_REGCLASS;
17416 #ifndef DEBUGGING
17417     PERL_UNUSED_ARG(depth);
17418 #endif
17419
17420     assert(! (ret_invlist && allow_mutiple_chars));
17421
17422     /* If wants an inversion list returned, we can't optimize to something
17423      * else. */
17424     if (ret_invlist) {
17425         optimizable = FALSE;
17426     }
17427
17428     DEBUG_PARSE("clas");
17429
17430 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17431     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17432                                    && UNICODE_DOT_DOT_VERSION == 0)
17433     allow_mutiple_chars = FALSE;
17434 #endif
17435
17436     /* We include the /i status at the beginning of this so that we can
17437      * know it at runtime */
17438     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17439     initial_listsv_len = SvCUR(listsv);
17440     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17441
17442     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17443
17444     assert(RExC_parse <= RExC_end);
17445
17446     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
17447         RExC_parse++;
17448         invert = TRUE;
17449         allow_mutiple_chars = FALSE;
17450         MARK_NAUGHTY(1);
17451         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17452     }
17453
17454     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17455     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17456         int maybe_class = handle_possible_posix(pRExC_state,
17457                                                 RExC_parse,
17458                                                 &not_posix_region_end,
17459                                                 NULL,
17460                                                 TRUE /* checking only */);
17461         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17462             ckWARN4reg(not_posix_region_end,
17463                     "POSIX syntax [%c %c] belongs inside character classes%s",
17464                     *RExC_parse, *RExC_parse,
17465                     (maybe_class == OOB_NAMEDCLASS)
17466                     ? ((POSIXCC_NOTYET(*RExC_parse))
17467                         ? " (but this one isn't implemented)"
17468                         : " (but this one isn't fully valid)")
17469                     : ""
17470                     );
17471         }
17472     }
17473
17474     /* If the caller wants us to just parse a single element, accomplish this
17475      * by faking the loop ending condition */
17476     if (stop_at_1 && RExC_end > RExC_parse) {
17477         stop_ptr = RExC_parse + 1;
17478     }
17479
17480     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17481     if (UCHARAT(RExC_parse) == ']')
17482         goto charclassloop;
17483
17484     while (1) {
17485
17486         if (   posix_warnings
17487             && av_tindex_skip_len_mg(posix_warnings) >= 0
17488             && RExC_parse > not_posix_region_end)
17489         {
17490             /* Warnings about posix class issues are considered tentative until
17491              * we are far enough along in the parse that we can no longer
17492              * change our mind, at which point we output them.  This is done
17493              * each time through the loop so that a later class won't zap them
17494              * before they have been dealt with. */
17495             output_posix_warnings(pRExC_state, posix_warnings);
17496         }
17497
17498         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17499
17500         if  (RExC_parse >= stop_ptr) {
17501             break;
17502         }
17503
17504         if  (UCHARAT(RExC_parse) == ']') {
17505             break;
17506         }
17507
17508       charclassloop:
17509
17510         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17511         save_value = value;
17512         save_prevvalue = prevvalue;
17513
17514         if (!range) {
17515             rangebegin = RExC_parse;
17516             element_count++;
17517             non_portable_endpoint = 0;
17518         }
17519         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17520             value = utf8n_to_uvchr((U8*)RExC_parse,
17521                                    RExC_end - RExC_parse,
17522                                    &numlen, UTF8_ALLOW_DEFAULT);
17523             RExC_parse += numlen;
17524         }
17525         else
17526             value = UCHARAT(RExC_parse++);
17527
17528         if (value == '[') {
17529             char * posix_class_end;
17530             namedclass = handle_possible_posix(pRExC_state,
17531                                                RExC_parse,
17532                                                &posix_class_end,
17533                                                do_posix_warnings ? &posix_warnings : NULL,
17534                                                FALSE    /* die if error */);
17535             if (namedclass > OOB_NAMEDCLASS) {
17536
17537                 /* If there was an earlier attempt to parse this particular
17538                  * posix class, and it failed, it was a false alarm, as this
17539                  * successful one proves */
17540                 if (   posix_warnings
17541                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17542                     && not_posix_region_end >= RExC_parse
17543                     && not_posix_region_end <= posix_class_end)
17544                 {
17545                     av_undef(posix_warnings);
17546                 }
17547
17548                 RExC_parse = posix_class_end;
17549             }
17550             else if (namedclass == OOB_NAMEDCLASS) {
17551                 not_posix_region_end = posix_class_end;
17552             }
17553             else {
17554                 namedclass = OOB_NAMEDCLASS;
17555             }
17556         }
17557         else if (   RExC_parse - 1 > not_posix_region_end
17558                  && MAYBE_POSIXCC(value))
17559         {
17560             (void) handle_possible_posix(
17561                         pRExC_state,
17562                         RExC_parse - 1,  /* -1 because parse has already been
17563                                             advanced */
17564                         &not_posix_region_end,
17565                         do_posix_warnings ? &posix_warnings : NULL,
17566                         TRUE /* checking only */);
17567         }
17568         else if (  strict && ! skip_white
17569                  && (   _generic_isCC(value, _CC_VERTSPACE)
17570                      || is_VERTWS_cp_high(value)))
17571         {
17572             vFAIL("Literal vertical space in [] is illegal except under /x");
17573         }
17574         else if (value == '\\') {
17575             /* Is a backslash; get the code point of the char after it */
17576
17577             if (RExC_parse >= RExC_end) {
17578                 vFAIL("Unmatched [");
17579             }
17580
17581             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17582                 value = utf8n_to_uvchr((U8*)RExC_parse,
17583                                    RExC_end - RExC_parse,
17584                                    &numlen, UTF8_ALLOW_DEFAULT);
17585                 RExC_parse += numlen;
17586             }
17587             else
17588                 value = UCHARAT(RExC_parse++);
17589
17590             /* Some compilers cannot handle switching on 64-bit integer
17591              * values, therefore value cannot be an UV.  Yes, this will
17592              * be a problem later if we want switch on Unicode.
17593              * A similar issue a little bit later when switching on
17594              * namedclass. --jhi */
17595
17596             /* If the \ is escaping white space when white space is being
17597              * skipped, it means that that white space is wanted literally, and
17598              * is already in 'value'.  Otherwise, need to translate the escape
17599              * into what it signifies. */
17600             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17601                 const char * message;
17602                 U32 packed_warn;
17603                 U8 grok_c_char;
17604
17605             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17606             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17607             case 's':   namedclass = ANYOF_SPACE;       break;
17608             case 'S':   namedclass = ANYOF_NSPACE;      break;
17609             case 'd':   namedclass = ANYOF_DIGIT;       break;
17610             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17611             case 'v':   namedclass = ANYOF_VERTWS;      break;
17612             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17613             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17614             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17615             case 'N':  /* Handle \N{NAME} in class */
17616                 {
17617                     const char * const backslash_N_beg = RExC_parse - 2;
17618                     int cp_count;
17619
17620                     if (! grok_bslash_N(pRExC_state,
17621                                         NULL,      /* No regnode */
17622                                         &value,    /* Yes single value */
17623                                         &cp_count, /* Multiple code pt count */
17624                                         flagp,
17625                                         strict,
17626                                         depth)
17627                     ) {
17628
17629                         if (*flagp & NEED_UTF8)
17630                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17631
17632                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17633
17634                         if (cp_count < 0) {
17635                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17636                         }
17637                         else if (cp_count == 0) {
17638                             ckWARNreg(RExC_parse,
17639                               "Ignoring zero length \\N{} in character class");
17640                         }
17641                         else { /* cp_count > 1 */
17642                             assert(cp_count > 1);
17643                             if (! RExC_in_multi_char_class) {
17644                                 if ( ! allow_mutiple_chars
17645                                     || invert
17646                                     || range
17647                                     || *RExC_parse == '-')
17648                                 {
17649                                     if (strict) {
17650                                         RExC_parse--;
17651                                         vFAIL("\\N{} here is restricted to one character");
17652                                     }
17653                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17654                                     break; /* <value> contains the first code
17655                                               point. Drop out of the switch to
17656                                               process it */
17657                                 }
17658                                 else {
17659                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17660                                                  RExC_parse - backslash_N_beg);
17661                                     multi_char_matches
17662                                         = add_multi_match(multi_char_matches,
17663                                                           multi_char_N,
17664                                                           cp_count);
17665                                 }
17666                             }
17667                         } /* End of cp_count != 1 */
17668
17669                         /* This element should not be processed further in this
17670                          * class */
17671                         element_count--;
17672                         value = save_value;
17673                         prevvalue = save_prevvalue;
17674                         continue;   /* Back to top of loop to get next char */
17675                     }
17676
17677                     /* Here, is a single code point, and <value> contains it */
17678                     unicode_range = TRUE;   /* \N{} are Unicode */
17679                 }
17680                 break;
17681             case 'p':
17682             case 'P':
17683                 {
17684                 char *e;
17685
17686                 if (RExC_pm_flags & PMf_WILDCARD) {
17687                     RExC_parse++;
17688                     /* diag_listed_as: Use of %s is not allowed in Unicode
17689                        property wildcard subpatterns in regex; marked by <--
17690                        HERE in m/%s/ */
17691                     vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17692                            " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17693                 }
17694
17695                 /* \p means they want Unicode semantics */
17696                 REQUIRE_UNI_RULES(flagp, 0);
17697
17698                 if (RExC_parse >= RExC_end)
17699                     vFAIL2("Empty \\%c", (U8)value);
17700                 if (*RExC_parse == '{') {
17701                     const U8 c = (U8)value;
17702                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17703                     if (!e) {
17704                         RExC_parse++;
17705                         vFAIL2("Missing right brace on \\%c{}", c);
17706                     }
17707
17708                     RExC_parse++;
17709
17710                     /* White space is allowed adjacent to the braces and after
17711                      * any '^', even when not under /x */
17712                     while (isSPACE(*RExC_parse)) {
17713                          RExC_parse++;
17714                     }
17715
17716                     if (UCHARAT(RExC_parse) == '^') {
17717
17718                         /* toggle.  (The rhs xor gets the single bit that
17719                          * differs between P and p; the other xor inverts just
17720                          * that bit) */
17721                         value ^= 'P' ^ 'p';
17722
17723                         RExC_parse++;
17724                         while (isSPACE(*RExC_parse)) {
17725                             RExC_parse++;
17726                         }
17727                     }
17728
17729                     if (e == RExC_parse)
17730                         vFAIL2("Empty \\%c{}", c);
17731
17732                     n = e - RExC_parse;
17733                     while (isSPACE(*(RExC_parse + n - 1)))
17734                         n--;
17735
17736                 }   /* The \p isn't immediately followed by a '{' */
17737                 else if (! isALPHA(*RExC_parse)) {
17738                     RExC_parse += (UTF)
17739                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17740                                   : 1;
17741                     vFAIL2("Character following \\%c must be '{' or a "
17742                            "single-character Unicode property name",
17743                            (U8) value);
17744                 }
17745                 else {
17746                     e = RExC_parse;
17747                     n = 1;
17748                 }
17749                 {
17750                     char* name = RExC_parse;
17751
17752                     /* Any message returned about expanding the definition */
17753                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17754
17755                     /* If set TRUE, the property is user-defined as opposed to
17756                      * official Unicode */
17757                     bool user_defined = FALSE;
17758                     AV * strings = NULL;
17759
17760                     SV * prop_definition = parse_uniprop_string(
17761                                             name, n, UTF, FOLD,
17762                                             FALSE, /* This is compile-time */
17763
17764                                             /* We can't defer this defn when
17765                                              * the full result is required in
17766                                              * this call */
17767                                             ! cBOOL(ret_invlist),
17768
17769                                             &strings,
17770                                             &user_defined,
17771                                             msg,
17772                                             0 /* Base level */
17773                                            );
17774                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17775                         assert(prop_definition == NULL);
17776                         RExC_parse = e + 1;
17777                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17778                                                thing so, or else the display is
17779                                                mojibake */
17780                             RExC_utf8 = TRUE;
17781                         }
17782                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17783                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17784                                     SvCUR(msg), SvPVX(msg)));
17785                     }
17786
17787                     assert(prop_definition || strings);
17788
17789                     if (strings) {
17790                         if (ret_invlist) {
17791                             if (! prop_definition) {
17792                                 RExC_parse = e + 1;
17793                                 vFAIL("Unicode string properties are not implemented in (?[...])");
17794                             }
17795                             else {
17796                                 ckWARNreg(e + 1,
17797                                     "Using just the single character results"
17798                                     " returned by \\p{} in (?[...])");
17799                             }
17800                         }
17801                         else if (! RExC_in_multi_char_class) {
17802                             if (invert ^ (value == 'P')) {
17803                                 RExC_parse = e + 1;
17804                                 vFAIL("Inverting a character class which contains"
17805                                     " a multi-character sequence is illegal");
17806                             }
17807
17808                             /* For each multi-character string ... */
17809                             while (av_count(strings) > 0) {
17810                                 /* ... Each entry is itself an array of code
17811                                 * points. */
17812                                 AV * this_string = (AV *) av_shift( strings);
17813                                 STRLEN cp_count = av_count(this_string);
17814                                 SV * final = newSV(cp_count * 4);
17815                                 SvPVCLEAR(final);
17816
17817                                 /* Create another string of sequences of \x{...} */
17818                                 while (av_count(this_string) > 0) {
17819                                     SV * character = av_shift(this_string);
17820                                     UV cp = SvUV(character);
17821
17822                                     if (cp > 255) {
17823                                         REQUIRE_UTF8(flagp);
17824                                     }
17825                                     Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17826                                                                         cp);
17827                                     SvREFCNT_dec_NN(character);
17828                                 }
17829                                 SvREFCNT_dec_NN(this_string);
17830
17831                                 /* And add that to the list of such things */
17832                                 multi_char_matches
17833                                             = add_multi_match(multi_char_matches,
17834                                                             final,
17835                                                             cp_count);
17836                             }
17837                         }
17838                         SvREFCNT_dec_NN(strings);
17839                     }
17840
17841                     if (! prop_definition) {    /* If we got only a string,
17842                                                    this iteration didn't really
17843                                                    find a character */
17844                         element_count--;
17845                     }
17846                     else if (! is_invlist(prop_definition)) {
17847
17848                         /* Here, the definition isn't known, so we have gotten
17849                          * returned a string that will be evaluated if and when
17850                          * encountered at runtime.  We add it to the list of
17851                          * such properties, along with whether it should be
17852                          * complemented or not */
17853                         if (value == 'P') {
17854                             sv_catpvs(listsv, "!");
17855                         }
17856                         else {
17857                             sv_catpvs(listsv, "+");
17858                         }
17859                         sv_catsv(listsv, prop_definition);
17860
17861                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17862
17863                         /* We don't know yet what this matches, so have to flag
17864                          * it */
17865                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17866                     }
17867                     else {
17868                         assert (prop_definition && is_invlist(prop_definition));
17869
17870                         /* Here we do have the complete property definition
17871                          *
17872                          * Temporary workaround for [perl #133136].  For this
17873                          * precise input that is in the .t that is failing,
17874                          * load utf8.pm, which is what the test wants, so that
17875                          * that .t passes */
17876                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17877                                         "foo\\p{Alnum}")
17878                             && ! hv_common(GvHVn(PL_incgv),
17879                                            NULL,
17880                                            "utf8.pm", sizeof("utf8.pm") - 1,
17881                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17882                         {
17883                             require_pv("utf8.pm");
17884                         }
17885
17886                         if (! user_defined &&
17887                             /* We warn on matching an above-Unicode code point
17888                              * if the match would return true, except don't
17889                              * warn for \p{All}, which has exactly one element
17890                              * = 0 */
17891                             (_invlist_contains_cp(prop_definition, 0x110000)
17892                                 && (! (_invlist_len(prop_definition) == 1
17893                                        && *invlist_array(prop_definition) == 0))))
17894                         {
17895                             warn_super = TRUE;
17896                         }
17897
17898                         /* Invert if asking for the complement */
17899                         if (value == 'P') {
17900                             _invlist_union_complement_2nd(properties,
17901                                                           prop_definition,
17902                                                           &properties);
17903                         }
17904                         else {
17905                             _invlist_union(properties, prop_definition, &properties);
17906                         }
17907                     }
17908                 }
17909
17910                 RExC_parse = e + 1;
17911                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17912                                                 named */
17913                 }
17914                 break;
17915             case 'n':   value = '\n';                   break;
17916             case 'r':   value = '\r';                   break;
17917             case 't':   value = '\t';                   break;
17918             case 'f':   value = '\f';                   break;
17919             case 'b':   value = '\b';                   break;
17920             case 'e':   value = ESC_NATIVE;             break;
17921             case 'a':   value = '\a';                   break;
17922             case 'o':
17923                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17924                 if (! grok_bslash_o(&RExC_parse,
17925                                             RExC_end,
17926                                             &value,
17927                                             &message,
17928                                             &packed_warn,
17929                                             strict,
17930                                             cBOOL(range), /* MAX_UV allowed for range
17931                                                       upper limit */
17932                                             UTF))
17933                 {
17934                     vFAIL(message);
17935                 }
17936                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17937                     warn_non_literal_string(RExC_parse, packed_warn, message);
17938                 }
17939
17940                 if (value < 256) {
17941                     non_portable_endpoint++;
17942                 }
17943                 break;
17944             case 'x':
17945                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17946                 if (!  grok_bslash_x(&RExC_parse,
17947                                             RExC_end,
17948                                             &value,
17949                                             &message,
17950                                             &packed_warn,
17951                                             strict,
17952                                             cBOOL(range), /* MAX_UV allowed for range
17953                                                       upper limit */
17954                                             UTF))
17955                 {
17956                     vFAIL(message);
17957                 }
17958                 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17959                     warn_non_literal_string(RExC_parse, packed_warn, message);
17960                 }
17961
17962                 if (value < 256) {
17963                     non_portable_endpoint++;
17964                 }
17965                 break;
17966             case 'c':
17967                 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17968                                                                 &packed_warn))
17969                 {
17970                     /* going to die anyway; point to exact spot of
17971                         * failure */
17972                     RExC_parse += (UTF)
17973                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17974                                   : 1;
17975                     vFAIL(message);
17976                 }
17977
17978                 value = grok_c_char;
17979                 RExC_parse++;
17980                 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17981                     warn_non_literal_string(RExC_parse, packed_warn, message);
17982                 }
17983
17984                 non_portable_endpoint++;
17985                 break;
17986             case '0': case '1': case '2': case '3': case '4':
17987             case '5': case '6': case '7':
17988                 {
17989                     /* Take 1-3 octal digits */
17990                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17991                               | PERL_SCAN_NOTIFY_ILLDIGIT;
17992                     numlen = (strict) ? 4 : 3;
17993                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17994                     RExC_parse += numlen;
17995                     if (numlen != 3) {
17996                         if (strict) {
17997                             RExC_parse += (UTF)
17998                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17999                                           : 1;
18000                             vFAIL("Need exactly 3 octal digits");
18001                         }
18002                         else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
18003                                  && RExC_parse < RExC_end
18004                                  && isDIGIT(*RExC_parse)
18005                                  && ckWARN(WARN_REGEXP))
18006                         {
18007                             reg_warn_non_literal_string(
18008                                  RExC_parse + 1,
18009                                  form_alien_digit_msg(8, numlen, RExC_parse,
18010                                                         RExC_end, UTF, FALSE));
18011                         }
18012                     }
18013                     if (value < 256) {
18014                         non_portable_endpoint++;
18015                     }
18016                     break;
18017                 }
18018             default:
18019                 /* Allow \_ to not give an error */
18020                 if (isWORDCHAR(value) && value != '_') {
18021                     if (strict) {
18022                         vFAIL2("Unrecognized escape \\%c in character class",
18023                                (int)value);
18024                     }
18025                     else {
18026                         ckWARN2reg(RExC_parse,
18027                             "Unrecognized escape \\%c in character class passed through",
18028                             (int)value);
18029                     }
18030                 }
18031                 break;
18032             }   /* End of switch on char following backslash */
18033         } /* end of handling backslash escape sequences */
18034
18035         /* Here, we have the current token in 'value' */
18036
18037         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18038             U8 classnum;
18039
18040             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
18041              * literal, as is the character that began the false range, i.e.
18042              * the 'a' in the examples */
18043             if (range) {
18044                 const int w = (RExC_parse >= rangebegin)
18045                                 ? RExC_parse - rangebegin
18046                                 : 0;
18047                 if (strict) {
18048                     vFAIL2utf8f(
18049                         "False [] range \"%" UTF8f "\"",
18050                         UTF8fARG(UTF, w, rangebegin));
18051                 }
18052                 else {
18053                     ckWARN2reg(RExC_parse,
18054                         "False [] range \"%" UTF8f "\"",
18055                         UTF8fARG(UTF, w, rangebegin));
18056                     cp_list = add_cp_to_invlist(cp_list, '-');
18057                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18058                                                             prevvalue);
18059                 }
18060
18061                 range = 0; /* this was not a true range */
18062                 element_count += 2; /* So counts for three values */
18063             }
18064
18065             classnum = namedclass_to_classnum(namedclass);
18066
18067             if (LOC && namedclass < ANYOF_POSIXL_MAX
18068 #ifndef HAS_ISASCII
18069                 && classnum != _CC_ASCII
18070 #endif
18071             ) {
18072                 SV* scratch_list = NULL;
18073
18074                 /* What the Posix classes (like \w, [:space:]) match isn't
18075                  * generally knowable under locale until actual match time.  A
18076                  * special node is used for these which has extra space for a
18077                  * bitmap, with a bit reserved for each named class that is to
18078                  * be matched against.  (This isn't needed for \p{} and
18079                  * pseudo-classes, as they are not affected by locale, and
18080                  * hence are dealt with separately.)  However, if a named class
18081                  * and its complement are both present, then it matches
18082                  * everything, and there is no runtime dependency.  Odd numbers
18083                  * are the complements of the next lower number, so xor works.
18084                  * (Note that something like [\w\D] should match everything,
18085                  * because \d should be a proper subset of \w.  But rather than
18086                  * trust that the locale is well behaved, we leave this to
18087                  * runtime to sort out) */
18088                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18089                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18090                     POSIXL_ZERO(posixl);
18091                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18092                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18093                     continue;   /* We could ignore the rest of the class, but
18094                                    best to parse it for any errors */
18095                 }
18096                 else { /* Here, isn't the complement of any already parsed
18097                           class */
18098                     POSIXL_SET(posixl, namedclass);
18099                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18100                     anyof_flags |= ANYOF_MATCHES_POSIXL;
18101
18102                     /* The above-Latin1 characters are not subject to locale
18103                      * rules.  Just add them to the unconditionally-matched
18104                      * list */
18105
18106                     /* Get the list of the above-Latin1 code points this
18107                      * matches */
18108                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18109                                             PL_XPosix_ptrs[classnum],
18110
18111                                             /* Odd numbers are complements,
18112                                              * like NDIGIT, NASCII, ... */
18113                                             namedclass % 2 != 0,
18114                                             &scratch_list);
18115                     /* Checking if 'cp_list' is NULL first saves an extra
18116                      * clone.  Its reference count will be decremented at the
18117                      * next union, etc, or if this is the only instance, at the
18118                      * end of the routine */
18119                     if (! cp_list) {
18120                         cp_list = scratch_list;
18121                     }
18122                     else {
18123                         _invlist_union(cp_list, scratch_list, &cp_list);
18124                         SvREFCNT_dec_NN(scratch_list);
18125                     }
18126                     continue;   /* Go get next character */
18127                 }
18128             }
18129             else {
18130
18131                 /* Here, is not /l, or is a POSIX class for which /l doesn't
18132                  * matter (or is a Unicode property, which is skipped here). */
18133                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
18134                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18135
18136                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
18137                          * nor /l make a difference in what these match,
18138                          * therefore we just add what they match to cp_list. */
18139                         if (classnum != _CC_VERTSPACE) {
18140                             assert(   namedclass == ANYOF_HORIZWS
18141                                    || namedclass == ANYOF_NHORIZWS);
18142
18143                             /* It turns out that \h is just a synonym for
18144                              * XPosixBlank */
18145                             classnum = _CC_BLANK;
18146                         }
18147
18148                         _invlist_union_maybe_complement_2nd(
18149                                 cp_list,
18150                                 PL_XPosix_ptrs[classnum],
18151                                 namedclass % 2 != 0,    /* Complement if odd
18152                                                           (NHORIZWS, NVERTWS)
18153                                                         */
18154                                 &cp_list);
18155                     }
18156                 }
18157                 else if (   AT_LEAST_UNI_SEMANTICS
18158                          || classnum == _CC_ASCII
18159                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
18160                                                    || classnum == _CC_XDIGIT)))
18161                 {
18162                     /* We usually have to worry about /d affecting what POSIX
18163                      * classes match, with special code needed because we won't
18164                      * know until runtime what all matches.  But there is no
18165                      * extra work needed under /u and /a; and [:ascii:] is
18166                      * unaffected by /d; and :digit: and :xdigit: don't have
18167                      * runtime differences under /d.  So we can special case
18168                      * these, and avoid some extra work below, and at runtime.
18169                      * */
18170                     _invlist_union_maybe_complement_2nd(
18171                                                      simple_posixes,
18172                                                       ((AT_LEAST_ASCII_RESTRICTED)
18173                                                        ? PL_Posix_ptrs[classnum]
18174                                                        : PL_XPosix_ptrs[classnum]),
18175                                                      namedclass % 2 != 0,
18176                                                      &simple_posixes);
18177                 }
18178                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
18179                            complement and use nposixes */
18180                     SV** posixes_ptr = namedclass % 2 == 0
18181                                        ? &posixes
18182                                        : &nposixes;
18183                     _invlist_union_maybe_complement_2nd(
18184                                                      *posixes_ptr,
18185                                                      PL_XPosix_ptrs[classnum],
18186                                                      namedclass % 2 != 0,
18187                                                      posixes_ptr);
18188                 }
18189             }
18190         } /* end of namedclass \blah */
18191
18192         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18193
18194         /* If 'range' is set, 'value' is the ending of a range--check its
18195          * validity.  (If value isn't a single code point in the case of a
18196          * range, we should have figured that out above in the code that
18197          * catches false ranges).  Later, we will handle each individual code
18198          * point in the range.  If 'range' isn't set, this could be the
18199          * beginning of a range, so check for that by looking ahead to see if
18200          * the next real character to be processed is the range indicator--the
18201          * minus sign */
18202
18203         if (range) {
18204 #ifdef EBCDIC
18205             /* For unicode ranges, we have to test that the Unicode as opposed
18206              * to the native values are not decreasing.  (Above 255, there is
18207              * no difference between native and Unicode) */
18208             if (unicode_range && prevvalue < 255 && value < 255) {
18209                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18210                     goto backwards_range;
18211                 }
18212             }
18213             else
18214 #endif
18215             if (prevvalue > value) /* b-a */ {
18216                 int w;
18217 #ifdef EBCDIC
18218               backwards_range:
18219 #endif
18220                 w = RExC_parse - rangebegin;
18221                 vFAIL2utf8f(
18222                     "Invalid [] range \"%" UTF8f "\"",
18223                     UTF8fARG(UTF, w, rangebegin));
18224                 NOT_REACHED; /* NOTREACHED */
18225             }
18226         }
18227         else {
18228             prevvalue = value; /* save the beginning of the potential range */
18229             if (! stop_at_1     /* Can't be a range if parsing just one thing */
18230                 && *RExC_parse == '-')
18231             {
18232                 char* next_char_ptr = RExC_parse + 1;
18233
18234                 /* Get the next real char after the '-' */
18235                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18236
18237                 /* If the '-' is at the end of the class (just before the ']',
18238                  * it is a literal minus; otherwise it is a range */
18239                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18240                     RExC_parse = next_char_ptr;
18241
18242                     /* a bad range like \w-, [:word:]- ? */
18243                     if (namedclass > OOB_NAMEDCLASS) {
18244                         if (strict || ckWARN(WARN_REGEXP)) {
18245                             const int w = RExC_parse >= rangebegin
18246                                           ?  RExC_parse - rangebegin
18247                                           : 0;
18248                             if (strict) {
18249                                 vFAIL4("False [] range \"%*.*s\"",
18250                                     w, w, rangebegin);
18251                             }
18252                             else {
18253                                 vWARN4(RExC_parse,
18254                                     "False [] range \"%*.*s\"",
18255                                     w, w, rangebegin);
18256                             }
18257                         }
18258                         cp_list = add_cp_to_invlist(cp_list, '-');
18259                         element_count++;
18260                     } else
18261                         range = 1;      /* yeah, it's a range! */
18262                     continue;   /* but do it the next time */
18263                 }
18264             }
18265         }
18266
18267         if (namedclass > OOB_NAMEDCLASS) {
18268             continue;
18269         }
18270
18271         /* Here, we have a single value this time through the loop, and
18272          * <prevvalue> is the beginning of the range, if any; or <value> if
18273          * not. */
18274
18275         /* non-Latin1 code point implies unicode semantics. */
18276         if (value > 255) {
18277             if (value > MAX_LEGAL_CP && (   value != UV_MAX
18278                                          || prevvalue > MAX_LEGAL_CP))
18279             {
18280                 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18281             }
18282             REQUIRE_UNI_RULES(flagp, 0);
18283             if (  ! silence_non_portable
18284                 &&  UNICODE_IS_PERL_EXTENDED(value)
18285                 &&  TO_OUTPUT_WARNINGS(RExC_parse))
18286             {
18287                 ckWARN2_non_literal_string(RExC_parse,
18288                                            packWARN(WARN_PORTABLE),
18289                                            PL_extended_cp_format,
18290                                            value);
18291             }
18292         }
18293
18294         /* Ready to process either the single value, or the completed range.
18295          * For single-valued non-inverted ranges, we consider the possibility
18296          * of multi-char folds.  (We made a conscious decision to not do this
18297          * for the other cases because it can often lead to non-intuitive
18298          * results.  For example, you have the peculiar case that:
18299          *  "s s" =~ /^[^\xDF]+$/i => Y
18300          *  "ss"  =~ /^[^\xDF]+$/i => N
18301          *
18302          * See [perl #89750] */
18303         if (FOLD && allow_mutiple_chars && value == prevvalue) {
18304             if (    value == LATIN_SMALL_LETTER_SHARP_S
18305                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18306                                                         value)))
18307             {
18308                 /* Here <value> is indeed a multi-char fold.  Get what it is */
18309
18310                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18311                 STRLEN foldlen;
18312
18313                 UV folded = _to_uni_fold_flags(
18314                                 value,
18315                                 foldbuf,
18316                                 &foldlen,
18317                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18318                                                    ? FOLD_FLAGS_NOMIX_ASCII
18319                                                    : 0)
18320                                 );
18321
18322                 /* Here, <folded> should be the first character of the
18323                  * multi-char fold of <value>, with <foldbuf> containing the
18324                  * whole thing.  But, if this fold is not allowed (because of
18325                  * the flags), <fold> will be the same as <value>, and should
18326                  * be processed like any other character, so skip the special
18327                  * handling */
18328                 if (folded != value) {
18329
18330                     /* Skip if we are recursed, currently parsing the class
18331                      * again.  Otherwise add this character to the list of
18332                      * multi-char folds. */
18333                     if (! RExC_in_multi_char_class) {
18334                         STRLEN cp_count = utf8_length(foldbuf,
18335                                                       foldbuf + foldlen);
18336                         SV* multi_fold = sv_2mortal(newSVpvs(""));
18337
18338                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18339
18340                         multi_char_matches
18341                                         = add_multi_match(multi_char_matches,
18342                                                           multi_fold,
18343                                                           cp_count);
18344
18345                     }
18346
18347                     /* This element should not be processed further in this
18348                      * class */
18349                     element_count--;
18350                     value = save_value;
18351                     prevvalue = save_prevvalue;
18352                     continue;
18353                 }
18354             }
18355         }
18356
18357         if (strict && ckWARN(WARN_REGEXP)) {
18358             if (range) {
18359
18360                 /* If the range starts above 255, everything is portable and
18361                  * likely to be so for any forseeable character set, so don't
18362                  * warn. */
18363                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18364                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18365                 }
18366                 else if (prevvalue != value) {
18367
18368                     /* Under strict, ranges that stop and/or end in an ASCII
18369                      * printable should have each end point be a portable value
18370                      * for it (preferably like 'A', but we don't warn if it is
18371                      * a (portable) Unicode name or code point), and the range
18372                      * must be all digits or all letters of the same case.
18373                      * Otherwise, the range is non-portable and unclear as to
18374                      * what it contains */
18375                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18376                         && (          non_portable_endpoint
18377                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18378                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18379                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18380                     ))) {
18381                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18382                                           " be some subset of \"0-9\","
18383                                           " \"A-Z\", or \"a-z\"");
18384                     }
18385                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18386                         SSize_t index_start;
18387                         SSize_t index_final;
18388
18389                         /* But the nature of Unicode and languages mean we
18390                          * can't do the same checks for above-ASCII ranges,
18391                          * except in the case of digit ones.  These should
18392                          * contain only digits from the same group of 10.  The
18393                          * ASCII case is handled just above.  Hence here, the
18394                          * range could be a range of digits.  First some
18395                          * unlikely special cases.  Grandfather in that a range
18396                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18397                          * if its starting value is one of the 10 digits prior
18398                          * to it.  This is because it is an alternate way of
18399                          * writing 19D1, and some people may expect it to be in
18400                          * that group.  But it is bad, because it won't give
18401                          * the expected results.  In Unicode 5.2 it was
18402                          * considered to be in that group (of 11, hence), but
18403                          * this was fixed in the next version */
18404
18405                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18406                             goto warn_bad_digit_range;
18407                         }
18408                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18409                                           &&     value <= 0x1D7FF))
18410                         {
18411                             /* This is the only other case currently in Unicode
18412                              * where the algorithm below fails.  The code
18413                              * points just above are the end points of a single
18414                              * range containing only decimal digits.  It is 5
18415                              * different series of 0-9.  All other ranges of
18416                              * digits currently in Unicode are just a single
18417                              * series.  (And mktables will notify us if a later
18418                              * Unicode version breaks this.)
18419                              *
18420                              * If the range being checked is at most 9 long,
18421                              * and the digit values represented are in
18422                              * numerical order, they are from the same series.
18423                              * */
18424                             if (         value - prevvalue > 9
18425                                 ||    (((    value - 0x1D7CE) % 10)
18426                                      <= (prevvalue - 0x1D7CE) % 10))
18427                             {
18428                                 goto warn_bad_digit_range;
18429                             }
18430                         }
18431                         else {
18432
18433                             /* For all other ranges of digits in Unicode, the
18434                              * algorithm is just to check if both end points
18435                              * are in the same series, which is the same range.
18436                              * */
18437                             index_start = _invlist_search(
18438                                                     PL_XPosix_ptrs[_CC_DIGIT],
18439                                                     prevvalue);
18440
18441                             /* Warn if the range starts and ends with a digit,
18442                              * and they are not in the same group of 10. */
18443                             if (   index_start >= 0
18444                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18445                                 && (index_final =
18446                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18447                                                     value)) != index_start
18448                                 && index_final >= 0
18449                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18450                             {
18451                               warn_bad_digit_range:
18452                                 vWARN(RExC_parse, "Ranges of digits should be"
18453                                                   " from the same group of"
18454                                                   " 10");
18455                             }
18456                         }
18457                     }
18458                 }
18459             }
18460             if ((! range || prevvalue == value) && non_portable_endpoint) {
18461                 if (isPRINT_A(value)) {
18462                     char literal[3];
18463                     unsigned d = 0;
18464                     if (isBACKSLASHED_PUNCT(value)) {
18465                         literal[d++] = '\\';
18466                     }
18467                     literal[d++] = (char) value;
18468                     literal[d++] = '\0';
18469
18470                     vWARN4(RExC_parse,
18471                            "\"%.*s\" is more clearly written simply as \"%s\"",
18472                            (int) (RExC_parse - rangebegin),
18473                            rangebegin,
18474                            literal
18475                         );
18476                 }
18477                 else if (isMNEMONIC_CNTRL(value)) {
18478                     vWARN4(RExC_parse,
18479                            "\"%.*s\" is more clearly written simply as \"%s\"",
18480                            (int) (RExC_parse - rangebegin),
18481                            rangebegin,
18482                            cntrl_to_mnemonic((U8) value)
18483                         );
18484                 }
18485             }
18486         }
18487
18488         /* Deal with this element of the class */
18489
18490 #ifndef EBCDIC
18491         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18492                                                     prevvalue, value);
18493 #else
18494         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18495          * that don't require special handling, we can just add the range like
18496          * we do for ASCII platforms */
18497         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18498             || ! (prevvalue < 256
18499                     && (unicode_range
18500                         || (! non_portable_endpoint
18501                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18502                                 || (isUPPER_A(prevvalue)
18503                                     && isUPPER_A(value)))))))
18504         {
18505             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18506                                                         prevvalue, value);
18507         }
18508         else {
18509             /* Here, requires special handling.  This can be because it is a
18510              * range whose code points are considered to be Unicode, and so
18511              * must be individually translated into native, or because its a
18512              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18513              * EBCDIC, but we have defined them to include only the "expected"
18514              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18515              * the same in native and Unicode, so can be added as a range */
18516             U8 start = NATIVE_TO_LATIN1(prevvalue);
18517             unsigned j;
18518             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18519             for (j = start; j <= end; j++) {
18520                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18521             }
18522             if (value > 255) {
18523                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18524                                                             256, value);
18525             }
18526         }
18527 #endif
18528
18529         range = 0; /* this range (if it was one) is done now */
18530     } /* End of loop through all the text within the brackets */
18531
18532     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18533         output_posix_warnings(pRExC_state, posix_warnings);
18534     }
18535
18536     /* If anything in the class expands to more than one character, we have to
18537      * deal with them by building up a substitute parse string, and recursively
18538      * calling reg() on it, instead of proceeding */
18539     if (multi_char_matches) {
18540         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18541         I32 cp_count;
18542         STRLEN len;
18543         char *save_end = RExC_end;
18544         char *save_parse = RExC_parse;
18545         char *save_start = RExC_start;
18546         Size_t constructed_prefix_len = 0; /* This gives the length of the
18547                                               constructed portion of the
18548                                               substitute parse. */
18549         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18550                                        a "|" */
18551         I32 reg_flags;
18552
18553         assert(! invert);
18554         /* Only one level of recursion allowed */
18555         assert(RExC_copy_start_in_constructed == RExC_precomp);
18556
18557 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18558            because too confusing */
18559         if (invert) {
18560             sv_catpvs(substitute_parse, "(?:");
18561         }
18562 #endif
18563
18564         /* Look at the longest strings first */
18565         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18566                         cp_count > 0;
18567                         cp_count--)
18568         {
18569
18570             if (av_exists(multi_char_matches, cp_count)) {
18571                 AV** this_array_ptr;
18572                 SV* this_sequence;
18573
18574                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18575                                                  cp_count, FALSE);
18576                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18577                                                                 &PL_sv_undef)
18578                 {
18579                     if (! first_time) {
18580                         sv_catpvs(substitute_parse, "|");
18581                     }
18582                     first_time = FALSE;
18583
18584                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18585                 }
18586             }
18587         }
18588
18589         /* If the character class contains anything else besides these
18590          * multi-character strings, have to include it in recursive parsing */
18591         if (element_count) {
18592             bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18593
18594             sv_catpvs(substitute_parse, "|");
18595             if (has_l_bracket) {    /* Add an [ if the original had one */
18596                 sv_catpvs(substitute_parse, "[");
18597             }
18598             constructed_prefix_len = SvCUR(substitute_parse);
18599             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18600
18601             /* Put in a closing ']' to match any opening one, but not if going
18602              * off the end, as otherwise we are adding something that really
18603              * isn't there */
18604             if (has_l_bracket && RExC_parse < RExC_end) {
18605                 sv_catpvs(substitute_parse, "]");
18606             }
18607         }
18608
18609         sv_catpvs(substitute_parse, ")");
18610 #if 0
18611         if (invert) {
18612             /* This is a way to get the parse to skip forward a whole named
18613              * sequence instead of matching the 2nd character when it fails the
18614              * first */
18615             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18616         }
18617 #endif
18618
18619         /* Set up the data structure so that any errors will be properly
18620          * reported.  See the comments at the definition of
18621          * REPORT_LOCATION_ARGS for details */
18622         RExC_copy_start_in_input = (char *) orig_parse;
18623         RExC_start = RExC_parse = SvPV(substitute_parse, len);
18624         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18625         RExC_end = RExC_parse + len;
18626         RExC_in_multi_char_class = 1;
18627
18628         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18629
18630         *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18631
18632         /* And restore so can parse the rest of the pattern */
18633         RExC_parse = save_parse;
18634         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18635         RExC_end = save_end;
18636         RExC_in_multi_char_class = 0;
18637         SvREFCNT_dec_NN(multi_char_matches);
18638         return ret;
18639     }
18640
18641     /* If folding, we calculate all characters that could fold to or from the
18642      * ones already on the list */
18643     if (cp_foldable_list) {
18644         if (FOLD) {
18645             UV start, end;      /* End points of code point ranges */
18646
18647             SV* fold_intersection = NULL;
18648             SV** use_list;
18649
18650             /* Our calculated list will be for Unicode rules.  For locale
18651              * matching, we have to keep a separate list that is consulted at
18652              * runtime only when the locale indicates Unicode rules (and we
18653              * don't include potential matches in the ASCII/Latin1 range, as
18654              * any code point could fold to any other, based on the run-time
18655              * locale).   For non-locale, we just use the general list */
18656             if (LOC) {
18657                 use_list = &only_utf8_locale_list;
18658             }
18659             else {
18660                 use_list = &cp_list;
18661             }
18662
18663             /* Only the characters in this class that participate in folds need
18664              * be checked.  Get the intersection of this class and all the
18665              * possible characters that are foldable.  This can quickly narrow
18666              * down a large class */
18667             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18668                                   &fold_intersection);
18669
18670             /* Now look at the foldable characters in this class individually */
18671             invlist_iterinit(fold_intersection);
18672             while (invlist_iternext(fold_intersection, &start, &end)) {
18673                 UV j;
18674                 UV folded;
18675
18676                 /* Look at every character in the range */
18677                 for (j = start; j <= end; j++) {
18678                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18679                     STRLEN foldlen;
18680                     unsigned int k;
18681                     Size_t folds_count;
18682                     U32 first_fold;
18683                     const U32 * remaining_folds;
18684
18685                     if (j < 256) {
18686
18687                         /* Under /l, we don't know what code points below 256
18688                          * fold to, except we do know the MICRO SIGN folds to
18689                          * an above-255 character if the locale is UTF-8, so we
18690                          * add it to the special list (in *use_list)  Otherwise
18691                          * we know now what things can match, though some folds
18692                          * are valid under /d only if the target is UTF-8.
18693                          * Those go in a separate list */
18694                         if (      IS_IN_SOME_FOLD_L1(j)
18695                             && ! (LOC && j != MICRO_SIGN))
18696                         {
18697
18698                             /* ASCII is always matched; non-ASCII is matched
18699                              * only under Unicode rules (which could happen
18700                              * under /l if the locale is a UTF-8 one */
18701                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18702                                 *use_list = add_cp_to_invlist(*use_list,
18703                                                             PL_fold_latin1[j]);
18704                             }
18705                             else if (j != PL_fold_latin1[j]) {
18706                                 upper_latin1_only_utf8_matches
18707                                         = add_cp_to_invlist(
18708                                                 upper_latin1_only_utf8_matches,
18709                                                 PL_fold_latin1[j]);
18710                             }
18711                         }
18712
18713                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18714                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18715                         {
18716                             add_above_Latin1_folds(pRExC_state,
18717                                                    (U8) j,
18718                                                    use_list);
18719                         }
18720                         continue;
18721                     }
18722
18723                     /* Here is an above Latin1 character.  We don't have the
18724                      * rules hard-coded for it.  First, get its fold.  This is
18725                      * the simple fold, as the multi-character folds have been
18726                      * handled earlier and separated out */
18727                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18728                                                         (ASCII_FOLD_RESTRICTED)
18729                                                         ? FOLD_FLAGS_NOMIX_ASCII
18730                                                         : 0);
18731
18732                     /* Single character fold of above Latin1.  Add everything
18733                      * in its fold closure to the list that this node should
18734                      * match. */
18735                     folds_count = _inverse_folds(folded, &first_fold,
18736                                                     &remaining_folds);
18737                     for (k = 0; k <= folds_count; k++) {
18738                         UV c = (k == 0)     /* First time through use itself */
18739                                 ? folded
18740                                 : (k == 1)  /* 2nd time use, the first fold */
18741                                    ? first_fold
18742
18743                                      /* Then the remaining ones */
18744                                    : remaining_folds[k-2];
18745
18746                         /* /aa doesn't allow folds between ASCII and non- */
18747                         if ((   ASCII_FOLD_RESTRICTED
18748                             && (isASCII(c) != isASCII(j))))
18749                         {
18750                             continue;
18751                         }
18752
18753                         /* Folds under /l which cross the 255/256 boundary are
18754                          * added to a separate list.  (These are valid only
18755                          * when the locale is UTF-8.) */
18756                         if (c < 256 && LOC) {
18757                             *use_list = add_cp_to_invlist(*use_list, c);
18758                             continue;
18759                         }
18760
18761                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18762                         {
18763                             cp_list = add_cp_to_invlist(cp_list, c);
18764                         }
18765                         else {
18766                             /* Similarly folds involving non-ascii Latin1
18767                              * characters under /d are added to their list */
18768                             upper_latin1_only_utf8_matches
18769                                     = add_cp_to_invlist(
18770                                                 upper_latin1_only_utf8_matches,
18771                                                 c);
18772                         }
18773                     }
18774                 }
18775             }
18776             SvREFCNT_dec_NN(fold_intersection);
18777         }
18778
18779         /* Now that we have finished adding all the folds, there is no reason
18780          * to keep the foldable list separate */
18781         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18782         SvREFCNT_dec_NN(cp_foldable_list);
18783     }
18784
18785     /* And combine the result (if any) with any inversion lists from posix
18786      * classes.  The lists are kept separate up to now because we don't want to
18787      * fold the classes */
18788     if (simple_posixes) {   /* These are the classes known to be unaffected by
18789                                /a, /aa, and /d */
18790         if (cp_list) {
18791             _invlist_union(cp_list, simple_posixes, &cp_list);
18792             SvREFCNT_dec_NN(simple_posixes);
18793         }
18794         else {
18795             cp_list = simple_posixes;
18796         }
18797     }
18798     if (posixes || nposixes) {
18799         if (! DEPENDS_SEMANTICS) {
18800
18801             /* For everything but /d, we can just add the current 'posixes' and
18802              * 'nposixes' to the main list */
18803             if (posixes) {
18804                 if (cp_list) {
18805                     _invlist_union(cp_list, posixes, &cp_list);
18806                     SvREFCNT_dec_NN(posixes);
18807                 }
18808                 else {
18809                     cp_list = posixes;
18810                 }
18811             }
18812             if (nposixes) {
18813                 if (cp_list) {
18814                     _invlist_union(cp_list, nposixes, &cp_list);
18815                     SvREFCNT_dec_NN(nposixes);
18816                 }
18817                 else {
18818                     cp_list = nposixes;
18819                 }
18820             }
18821         }
18822         else {
18823             /* Under /d, things like \w match upper Latin1 characters only if
18824              * the target string is in UTF-8.  But things like \W match all the
18825              * upper Latin1 characters if the target string is not in UTF-8.
18826              *
18827              * Handle the case with something like \W separately */
18828             if (nposixes) {
18829                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18830
18831                 /* A complemented posix class matches all upper Latin1
18832                  * characters if not in UTF-8.  And it matches just certain
18833                  * ones when in UTF-8.  That means those certain ones are
18834                  * matched regardless, so can just be added to the
18835                  * unconditional list */
18836                 if (cp_list) {
18837                     _invlist_union(cp_list, nposixes, &cp_list);
18838                     SvREFCNT_dec_NN(nposixes);
18839                     nposixes = NULL;
18840                 }
18841                 else {
18842                     cp_list = nposixes;
18843                 }
18844
18845                 /* Likewise for 'posixes' */
18846                 _invlist_union(posixes, cp_list, &cp_list);
18847                 SvREFCNT_dec(posixes);
18848
18849                 /* Likewise for anything else in the range that matched only
18850                  * under UTF-8 */
18851                 if (upper_latin1_only_utf8_matches) {
18852                     _invlist_union(cp_list,
18853                                    upper_latin1_only_utf8_matches,
18854                                    &cp_list);
18855                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18856                     upper_latin1_only_utf8_matches = NULL;
18857                 }
18858
18859                 /* If we don't match all the upper Latin1 characters regardless
18860                  * of UTF-8ness, we have to set a flag to match the rest when
18861                  * not in UTF-8 */
18862                 _invlist_subtract(only_non_utf8_list, cp_list,
18863                                   &only_non_utf8_list);
18864                 if (_invlist_len(only_non_utf8_list) != 0) {
18865                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18866                 }
18867                 SvREFCNT_dec_NN(only_non_utf8_list);
18868             }
18869             else {
18870                 /* Here there were no complemented posix classes.  That means
18871                  * the upper Latin1 characters in 'posixes' match only when the
18872                  * target string is in UTF-8.  So we have to add them to the
18873                  * list of those types of code points, while adding the
18874                  * remainder to the unconditional list.
18875                  *
18876                  * First calculate what they are */
18877                 SV* nonascii_but_latin1_properties = NULL;
18878                 _invlist_intersection(posixes, PL_UpperLatin1,
18879                                       &nonascii_but_latin1_properties);
18880
18881                 /* And add them to the final list of such characters. */
18882                 _invlist_union(upper_latin1_only_utf8_matches,
18883                                nonascii_but_latin1_properties,
18884                                &upper_latin1_only_utf8_matches);
18885
18886                 /* Remove them from what now becomes the unconditional list */
18887                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18888                                   &posixes);
18889
18890                 /* And add those unconditional ones to the final list */
18891                 if (cp_list) {
18892                     _invlist_union(cp_list, posixes, &cp_list);
18893                     SvREFCNT_dec_NN(posixes);
18894                     posixes = NULL;
18895                 }
18896                 else {
18897                     cp_list = posixes;
18898                 }
18899
18900                 SvREFCNT_dec(nonascii_but_latin1_properties);
18901
18902                 /* Get rid of any characters from the conditional list that we
18903                  * now know are matched unconditionally, which may make that
18904                  * list empty */
18905                 _invlist_subtract(upper_latin1_only_utf8_matches,
18906                                   cp_list,
18907                                   &upper_latin1_only_utf8_matches);
18908                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18909                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18910                     upper_latin1_only_utf8_matches = NULL;
18911                 }
18912             }
18913         }
18914     }
18915
18916     /* And combine the result (if any) with any inversion list from properties.
18917      * The lists are kept separate up to now so that we can distinguish the two
18918      * in regards to matching above-Unicode.  A run-time warning is generated
18919      * if a Unicode property is matched against a non-Unicode code point. But,
18920      * we allow user-defined properties to match anything, without any warning,
18921      * and we also suppress the warning if there is a portion of the character
18922      * class that isn't a Unicode property, and which matches above Unicode, \W
18923      * or [\x{110000}] for example.
18924      * (Note that in this case, unlike the Posix one above, there is no
18925      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18926      * forces Unicode semantics */
18927     if (properties) {
18928         if (cp_list) {
18929
18930             /* If it matters to the final outcome, see if a non-property
18931              * component of the class matches above Unicode.  If so, the
18932              * warning gets suppressed.  This is true even if just a single
18933              * such code point is specified, as, though not strictly correct if
18934              * another such code point is matched against, the fact that they
18935              * are using above-Unicode code points indicates they should know
18936              * the issues involved */
18937             if (warn_super) {
18938                 warn_super = ! (invert
18939                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18940             }
18941
18942             _invlist_union(properties, cp_list, &cp_list);
18943             SvREFCNT_dec_NN(properties);
18944         }
18945         else {
18946             cp_list = properties;
18947         }
18948
18949         if (warn_super) {
18950             anyof_flags
18951              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18952
18953             /* Because an ANYOF node is the only one that warns, this node
18954              * can't be optimized into something else */
18955             optimizable = FALSE;
18956         }
18957     }
18958
18959     /* Here, we have calculated what code points should be in the character
18960      * class.
18961      *
18962      * Now we can see about various optimizations.  Fold calculation (which we
18963      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18964      * would invert to include K, which under /i would match k, which it
18965      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18966      * folded until runtime */
18967
18968     /* If we didn't do folding, it's because some information isn't available
18969      * until runtime; set the run-time fold flag for these  We know to set the
18970      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18971      * at least one 0-255 range code point */
18972     if (LOC && FOLD) {
18973
18974         /* Some things on the list might be unconditionally included because of
18975          * other components.  Remove them, and clean up the list if it goes to
18976          * 0 elements */
18977         if (only_utf8_locale_list && cp_list) {
18978             _invlist_subtract(only_utf8_locale_list, cp_list,
18979                               &only_utf8_locale_list);
18980
18981             if (_invlist_len(only_utf8_locale_list) == 0) {
18982                 SvREFCNT_dec_NN(only_utf8_locale_list);
18983                 only_utf8_locale_list = NULL;
18984             }
18985         }
18986         if (    only_utf8_locale_list
18987             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18988                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18989         {
18990             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18991             anyof_flags
18992                  |= ANYOFL_FOLD
18993                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18994         }
18995         else if (cp_list && invlist_lowest(cp_list) < 256) {
18996             /* If nothing is below 256, has no locale dependency; otherwise it
18997              * does */
18998             anyof_flags |= ANYOFL_FOLD;
18999             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19000         }
19001     }
19002     else if (   DEPENDS_SEMANTICS
19003              && (    upper_latin1_only_utf8_matches
19004                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
19005     {
19006         RExC_seen_d_op = TRUE;
19007         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
19008     }
19009
19010     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19011      * compile time. */
19012     if (     cp_list
19013         &&   invert
19014         && ! has_runtime_dependency)
19015     {
19016         _invlist_invert(cp_list);
19017
19018         /* Clear the invert flag since have just done it here */
19019         invert = FALSE;
19020     }
19021
19022     /* All possible optimizations below still have these characteristics.
19023      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19024      * routine) */
19025     *flagp |= HASWIDTH|SIMPLE;
19026
19027     if (ret_invlist) {
19028         *ret_invlist = cp_list;
19029
19030         return (cp_list) ? RExC_emit : 0;
19031     }
19032
19033     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19034         RExC_contains_locale = 1;
19035     }
19036
19037     /* Some character classes are equivalent to other nodes.  Such nodes take
19038      * up less room, and some nodes require fewer operations to execute, than
19039      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
19040      * improve efficiency. */
19041
19042     if (optimizable) {
19043         PERL_UINT_FAST8_T i;
19044         UV partial_cp_count = 0;
19045         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19046         UV   end[MAX_FOLD_FROMS+1] = { 0 };
19047         bool single_range = FALSE;
19048
19049         if (cp_list) { /* Count the code points in enough ranges that we would
19050                           see all the ones possible in any fold in this version
19051                           of Unicode */
19052
19053             invlist_iterinit(cp_list);
19054             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19055                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19056                     break;
19057                 }
19058                 partial_cp_count += end[i] - start[i] + 1;
19059             }
19060
19061             if (i == 1) {
19062                 single_range = TRUE;
19063             }
19064             invlist_iterfinish(cp_list);
19065         }
19066
19067         /* If we know at compile time that this matches every possible code
19068          * point, any run-time dependencies don't matter */
19069         if (start[0] == 0 && end[0] == UV_MAX) {
19070             if (invert) {
19071                 ret = reganode(pRExC_state, OPFAIL, 0);
19072             }
19073             else {
19074                 ret = reg_node(pRExC_state, SANY);
19075                 MARK_NAUGHTY(1);
19076             }
19077             goto not_anyof;
19078         }
19079
19080         /* Similarly, for /l posix classes, if both a class and its
19081          * complement match, any run-time dependencies don't matter */
19082         if (posixl) {
19083             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19084                                                         namedclass += 2)
19085             {
19086                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
19087                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19088                 {
19089                     if (invert) {
19090                         ret = reganode(pRExC_state, OPFAIL, 0);
19091                     }
19092                     else {
19093                         ret = reg_node(pRExC_state, SANY);
19094                         MARK_NAUGHTY(1);
19095                     }
19096                     goto not_anyof;
19097                 }
19098             }
19099
19100             /* For well-behaved locales, some classes are subsets of others,
19101              * so complementing the subset and including the non-complemented
19102              * superset should match everything, like [\D[:alnum:]], and
19103              * [[:^alpha:][:alnum:]], but some implementations of locales are
19104              * buggy, and khw thinks its a bad idea to have optimization change
19105              * behavior, even if it avoids an OS bug in a given case */
19106
19107 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19108
19109             /* If is a single posix /l class, can optimize to just that op.
19110              * Such a node will not match anything in the Latin1 range, as that
19111              * is not determinable until runtime, but will match whatever the
19112              * class does outside that range.  (Note that some classes won't
19113              * match anything outside the range, like [:ascii:]) */
19114             if (    isSINGLE_BIT_SET(posixl)
19115                 && (partial_cp_count == 0 || start[0] > 255))
19116             {
19117                 U8 classnum;
19118                 SV * class_above_latin1 = NULL;
19119                 bool already_inverted;
19120                 bool are_equivalent;
19121
19122                 /* Compute which bit is set, which is the same thing as, e.g.,
19123                  * ANYOF_CNTRL.  From
19124                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19125                  * */
19126                 static const int MultiplyDeBruijnBitPosition2[32] =
19127                     {
19128                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19129                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19130                     };
19131
19132                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19133                                                           * 0x077CB531U) >> 27];
19134                 classnum = namedclass_to_classnum(namedclass);
19135
19136                 /* The named classes are such that the inverted number is one
19137                  * larger than the non-inverted one */
19138                 already_inverted = namedclass
19139                                  - classnum_to_namedclass(classnum);
19140
19141                 /* Create an inversion list of the official property, inverted
19142                  * if the constructed node list is inverted, and restricted to
19143                  * only the above latin1 code points, which are the only ones
19144                  * known at compile time */
19145                 _invlist_intersection_maybe_complement_2nd(
19146                                                     PL_AboveLatin1,
19147                                                     PL_XPosix_ptrs[classnum],
19148                                                     already_inverted,
19149                                                     &class_above_latin1);
19150                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19151                                                                         FALSE);
19152                 SvREFCNT_dec_NN(class_above_latin1);
19153
19154                 if (are_equivalent) {
19155
19156                     /* Resolve the run-time inversion flag with this possibly
19157                      * inverted class */
19158                     invert = invert ^ already_inverted;
19159
19160                     ret = reg_node(pRExC_state,
19161                                    POSIXL + invert * (NPOSIXL - POSIXL));
19162                     FLAGS(REGNODE_p(ret)) = classnum;
19163                     goto not_anyof;
19164                 }
19165             }
19166         }
19167
19168         /* khw can't think of any other possible transformation involving
19169          * these. */
19170         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19171             goto is_anyof;
19172         }
19173
19174         if (! has_runtime_dependency) {
19175
19176             /* If the list is empty, nothing matches.  This happens, for
19177              * example, when a Unicode property that doesn't match anything is
19178              * the only element in the character class (perluniprops.pod notes
19179              * such properties). */
19180             if (partial_cp_count == 0) {
19181                 if (invert) {
19182                     ret = reg_node(pRExC_state, SANY);
19183                 }
19184                 else {
19185                     ret = reganode(pRExC_state, OPFAIL, 0);
19186                 }
19187
19188                 goto not_anyof;
19189             }
19190
19191             /* If matches everything but \n */
19192             if (   start[0] == 0 && end[0] == '\n' - 1
19193                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19194             {
19195                 assert (! invert);
19196                 ret = reg_node(pRExC_state, REG_ANY);
19197                 MARK_NAUGHTY(1);
19198                 goto not_anyof;
19199             }
19200         }
19201
19202         /* Next see if can optimize classes that contain just a few code points
19203          * into an EXACTish node.  The reason to do this is to let the
19204          * optimizer join this node with adjacent EXACTish ones, and ANYOF
19205          * nodes require conversion to code point from UTF-8.
19206          *
19207          * An EXACTFish node can be generated even if not under /i, and vice
19208          * versa.  But care must be taken.  An EXACTFish node has to be such
19209          * that it only matches precisely the code points in the class, but we
19210          * want to generate the least restrictive one that does that, to
19211          * increase the odds of being able to join with an adjacent node.  For
19212          * example, if the class contains [kK], we have to make it an EXACTFAA
19213          * node to prevent the KELVIN SIGN from matching.  Whether we are under
19214          * /i or not is irrelevant in this case.  Less obvious is the pattern
19215          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
19216          * supposed to match the single character U+0149 LATIN SMALL LETTER N
19217          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
19218          * that includes \X{02BC}, there is a multi-char fold that does, and so
19219          * the node generated for it must be an EXACTFish one.  On the other
19220          * hand qr/:/i should generate a plain EXACT node since the colon
19221          * participates in no fold whatsoever, and having it EXACT tells the
19222          * optimizer the target string cannot match unless it has a colon in
19223          * it.
19224          */
19225         if (   ! posixl
19226             && ! invert
19227
19228                 /* Only try if there are no more code points in the class than
19229                  * in the max possible fold */
19230             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19231         {
19232             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19233             {
19234                 /* We can always make a single code point class into an
19235                  * EXACTish node. */
19236
19237                 if (LOC) {
19238
19239                     /* Here is /l:  Use EXACTL, except if there is a fold not
19240                      * known until runtime so shows as only a single code point
19241                      * here.  For code points above 255, we know which can
19242                      * cause problems by having a potential fold to the Latin1
19243                      * range. */
19244                     if (  ! FOLD
19245                         || (     start[0] > 255
19246                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19247                     {
19248                         op = EXACTL;
19249                     }
19250                     else {
19251                         op = EXACTFL;
19252                     }
19253                 }
19254                 else if (! FOLD) { /* Not /l and not /i */
19255                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19256                 }
19257                 else if (start[0] < 256) { /* /i, not /l, and the code point is
19258                                               small */
19259
19260                     /* Under /i, it gets a little tricky.  A code point that
19261                      * doesn't participate in a fold should be an EXACT node.
19262                      * We know this one isn't the result of a simple fold, or
19263                      * there'd be more than one code point in the list, but it
19264                      * could be part of a multi- character fold.  In that case
19265                      * we better not create an EXACT node, as we would wrongly
19266                      * be telling the optimizer that this code point must be in
19267                      * the target string, and that is wrong.  This is because
19268                      * if the sequence around this code point forms a
19269                      * multi-char fold, what needs to be in the string could be
19270                      * the code point that folds to the sequence.
19271                      *
19272                      * This handles the case of below-255 code points, as we
19273                      * have an easy look up for those.  The next clause handles
19274                      * the above-256 one */
19275                     op = IS_IN_SOME_FOLD_L1(start[0])
19276                          ? EXACTFU
19277                          : EXACT;
19278                 }
19279                 else {  /* /i, larger code point.  Since we are under /i, and
19280                            have just this code point, we know that it can't
19281                            fold to something else, so PL_InMultiCharFold
19282                            applies to it */
19283                     op = _invlist_contains_cp(PL_InMultiCharFold,
19284                                               start[0])
19285                          ? EXACTFU_REQ8
19286                          : EXACT_REQ8;
19287                 }
19288
19289                 value = start[0];
19290             }
19291             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19292                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
19293             {
19294                 /* Here, the only runtime dependency, if any, is from /d, and
19295                  * the class matches more than one code point, and the lowest
19296                  * code point participates in some fold.  It might be that the
19297                  * other code points are /i equivalent to this one, and hence
19298                  * they would representable by an EXACTFish node.  Above, we
19299                  * eliminated classes that contain too many code points to be
19300                  * EXACTFish, with the test for MAX_FOLD_FROMS
19301                  *
19302                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
19303                  * We do this because we have EXACTFAA at our disposal for the
19304                  * ASCII range */
19305                 if (partial_cp_count == 2 && isASCII(start[0])) {
19306
19307                     /* The only ASCII characters that participate in folds are
19308                      * alphabetics */
19309                     assert(isALPHA(start[0]));
19310                     if (   end[0] == start[0]   /* First range is a single
19311                                                    character, so 2nd exists */
19312                         && isALPHA_FOLD_EQ(start[0], start[1]))
19313                     {
19314
19315                         /* Here, is part of an ASCII fold pair */
19316
19317                         if (   ASCII_FOLD_RESTRICTED
19318                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19319                         {
19320                             /* If the second clause just above was true, it
19321                              * means we can't be under /i, or else the list
19322                              * would have included more than this fold pair.
19323                              * Therefore we have to exclude the possibility of
19324                              * whatever else it is that folds to these, by
19325                              * using EXACTFAA */
19326                             op = EXACTFAA;
19327                         }
19328                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19329
19330                             /* Here, there's no simple fold that start[0] is part
19331                              * of, but there is a multi-character one.  If we
19332                              * are not under /i, we want to exclude that
19333                              * possibility; if under /i, we want to include it
19334                              * */
19335                             op = (FOLD) ? EXACTFU : EXACTFAA;
19336                         }
19337                         else {
19338
19339                             /* Here, the only possible fold start[0] particpates in
19340                              * is with start[1].  /i or not isn't relevant */
19341                             op = EXACTFU;
19342                         }
19343
19344                         value = toFOLD(start[0]);
19345                     }
19346                 }
19347                 else if (  ! upper_latin1_only_utf8_matches
19348                          || (   _invlist_len(upper_latin1_only_utf8_matches)
19349                                                                           == 2
19350                              && PL_fold_latin1[
19351                                invlist_highest(upper_latin1_only_utf8_matches)]
19352                              == start[0]))
19353                 {
19354                     /* Here, the smallest character is non-ascii or there are
19355                      * more than 2 code points matched by this node.  Also, we
19356                      * either don't have /d UTF-8 dependent matches, or if we
19357                      * do, they look like they could be a single character that
19358                      * is the fold of the lowest one in the always-match list.
19359                      * This test quickly excludes most of the false positives
19360                      * when there are /d UTF-8 depdendent matches.  These are
19361                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19362                      * SMALL LETTER A WITH GRAVE iff the target string is
19363                      * UTF-8.  (We don't have to worry above about exceeding
19364                      * the array bounds of PL_fold_latin1[] because any code
19365                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
19366                      *
19367                      * EXACTFAA would apply only to pairs (hence exactly 2 code
19368                      * points) in the ASCII range, so we can't use it here to
19369                      * artificially restrict the fold domain, so we check if
19370                      * the class does or does not match some EXACTFish node.
19371                      * Further, if we aren't under /i, and the folded-to
19372                      * character is part of a multi-character fold, we can't do
19373                      * this optimization, as the sequence around it could be
19374                      * that multi-character fold, and we don't here know the
19375                      * context, so we have to assume it is that multi-char
19376                      * fold, to prevent potential bugs.
19377                      *
19378                      * To do the general case, we first find the fold of the
19379                      * lowest code point (which may be higher than the lowest
19380                      * one), then find everything that folds to it.  (The data
19381                      * structure we have only maps from the folded code points,
19382                      * so we have to do the earlier step.) */
19383
19384                     Size_t foldlen;
19385                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19386                     UV folded = _to_uni_fold_flags(start[0],
19387                                                         foldbuf, &foldlen, 0);
19388                     U32 first_fold;
19389                     const U32 * remaining_folds;
19390                     Size_t folds_to_this_cp_count = _inverse_folds(
19391                                                             folded,
19392                                                             &first_fold,
19393                                                             &remaining_folds);
19394                     Size_t folds_count = folds_to_this_cp_count + 1;
19395                     SV * fold_list = _new_invlist(folds_count);
19396                     unsigned int i;
19397
19398                     /* If there are UTF-8 dependent matches, create a temporary
19399                      * list of what this node matches, including them. */
19400                     SV * all_cp_list = NULL;
19401                     SV ** use_this_list = &cp_list;
19402
19403                     if (upper_latin1_only_utf8_matches) {
19404                         all_cp_list = _new_invlist(0);
19405                         use_this_list = &all_cp_list;
19406                         _invlist_union(cp_list,
19407                                        upper_latin1_only_utf8_matches,
19408                                        use_this_list);
19409                     }
19410
19411                     /* Having gotten everything that participates in the fold
19412                      * containing the lowest code point, we turn that into an
19413                      * inversion list, making sure everything is included. */
19414                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19415                     fold_list = add_cp_to_invlist(fold_list, folded);
19416                     if (folds_to_this_cp_count > 0) {
19417                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19418                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19419                             fold_list = add_cp_to_invlist(fold_list,
19420                                                         remaining_folds[i]);
19421                         }
19422                     }
19423
19424                     /* If the fold list is identical to what's in this ANYOF
19425                      * node, the node can be represented by an EXACTFish one
19426                      * instead */
19427                     if (_invlistEQ(*use_this_list, fold_list,
19428                                    0 /* Don't complement */ )
19429                     ) {
19430
19431                         /* But, we have to be careful, as mentioned above.
19432                          * Just the right sequence of characters could match
19433                          * this if it is part of a multi-character fold.  That
19434                          * IS what we want if we are under /i.  But it ISN'T
19435                          * what we want if not under /i, as it could match when
19436                          * it shouldn't.  So, when we aren't under /i and this
19437                          * character participates in a multi-char fold, we
19438                          * don't optimize into an EXACTFish node.  So, for each
19439                          * case below we have to check if we are folding
19440                          * and if not, if it is not part of a multi-char fold.
19441                          * */
19442                         if (start[0] > 255) {    /* Highish code point */
19443                             if (FOLD || ! _invlist_contains_cp(
19444                                             PL_InMultiCharFold, folded))
19445                             {
19446                                 op = (LOC)
19447                                      ? EXACTFLU8
19448                                      : (ASCII_FOLD_RESTRICTED)
19449                                        ? EXACTFAA
19450                                        : EXACTFU_REQ8;
19451                                 value = folded;
19452                             }
19453                         }   /* Below, the lowest code point < 256 */
19454                         else if (    FOLD
19455                                  &&  folded == 's'
19456                                  &&  DEPENDS_SEMANTICS)
19457                         {   /* An EXACTF node containing a single character
19458                                 's', can be an EXACTFU if it doesn't get
19459                                 joined with an adjacent 's' */
19460                             op = EXACTFU_S_EDGE;
19461                             value = folded;
19462                         }
19463                         else if (    FOLD
19464                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19465                         {
19466                             if (upper_latin1_only_utf8_matches) {
19467                                 op = EXACTF;
19468
19469                                 /* We can't use the fold, as that only matches
19470                                  * under UTF-8 */
19471                                 value = start[0];
19472                             }
19473                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19474                                      && ! UTF)
19475                             {   /* EXACTFUP is a special node for this
19476                                    character */
19477                                 op = (ASCII_FOLD_RESTRICTED)
19478                                      ? EXACTFAA
19479                                      : EXACTFUP;
19480                                 value = MICRO_SIGN;
19481                             }
19482                             else if (     ASCII_FOLD_RESTRICTED
19483                                      && ! isASCII(start[0]))
19484                             {   /* For ASCII under /iaa, we can use EXACTFU
19485                                    below */
19486                                 op = EXACTFAA;
19487                                 value = folded;
19488                             }
19489                             else {
19490                                 op = EXACTFU;
19491                                 value = folded;
19492                             }
19493                         }
19494                     }
19495
19496                     SvREFCNT_dec_NN(fold_list);
19497                     SvREFCNT_dec(all_cp_list);
19498                 }
19499             }
19500
19501             if (op != END) {
19502                 U8 len;
19503
19504                 /* Here, we have calculated what EXACTish node to use.  Have to
19505                  * convert to UTF-8 if not already there */
19506                 if (value > 255) {
19507                     if (! UTF) {
19508                         SvREFCNT_dec(cp_list);;
19509                         REQUIRE_UTF8(flagp);
19510                     }
19511
19512                     /* This is a kludge to the special casing issues with this
19513                      * ligature under /aa.  FB05 should fold to FB06, but the
19514                      * call above to _to_uni_fold_flags() didn't find this, as
19515                      * it didn't use the /aa restriction in order to not miss
19516                      * other folds that would be affected.  This is the only
19517                      * instance likely to ever be a problem in all of Unicode.
19518                      * So special case it. */
19519                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19520                         && ASCII_FOLD_RESTRICTED)
19521                     {
19522                         value = LATIN_SMALL_LIGATURE_ST;
19523                     }
19524                 }
19525
19526                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19527
19528                 ret = regnode_guts(pRExC_state, op, len, "exact");
19529                 FILL_NODE(ret, op);
19530                 RExC_emit += 1 + STR_SZ(len);
19531                 setSTR_LEN(REGNODE_p(ret), len);
19532                 if (len == 1) {
19533                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19534                 }
19535                 else {
19536                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19537                 }
19538                 goto not_anyof;
19539             }
19540         }
19541
19542         if (! has_runtime_dependency) {
19543
19544             /* See if this can be turned into an ANYOFM node.  Think about the
19545              * bit patterns in two different bytes.  In some positions, the
19546              * bits in each will be 1; and in other positions both will be 0;
19547              * and in some positions the bit will be 1 in one byte, and 0 in
19548              * the other.  Let 'n' be the number of positions where the bits
19549              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19550              * a position where the two bytes differ.  Now take the set of all
19551              * bytes that when ANDed with the mask yield the same result.  That
19552              * set has 2**n elements, and is representable by just two 8 bit
19553              * numbers: the result and the mask.  Importantly, matching the set
19554              * can be vectorized by creating a word full of the result bytes,
19555              * and a word full of the mask bytes, yielding a significant speed
19556              * up.  Here, see if this node matches such a set.  As a concrete
19557              * example consider [01], and the byte representing '0' which is
19558              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19559              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19560              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19561              * which is a common usage, is optimizable into ANYOFM, and can
19562              * benefit from the speed up.  We can only do this on UTF-8
19563              * invariant bytes, because they have the same bit patterns under
19564              * UTF-8 as not. */
19565             PERL_UINT_FAST8_T inverted = 0;
19566 #ifdef EBCDIC
19567             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19568 #else
19569             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19570 #endif
19571             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19572              * If that works we will instead later generate an NANYOFM, and
19573              * invert back when through */
19574             if (invlist_highest(cp_list) > max_permissible) {
19575                 _invlist_invert(cp_list);
19576                 inverted = 1;
19577             }
19578
19579             if (invlist_highest(cp_list) <= max_permissible) {
19580                 UV this_start, this_end;
19581                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19582                 U8 bits_differing = 0;
19583                 Size_t full_cp_count = 0;
19584                 bool first_time = TRUE;
19585
19586                 /* Go through the bytes and find the bit positions that differ
19587                  * */
19588                 invlist_iterinit(cp_list);
19589                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19590                     unsigned int i = this_start;
19591
19592                     if (first_time) {
19593                         if (! UVCHR_IS_INVARIANT(i)) {
19594                             goto done_anyofm;
19595                         }
19596
19597                         first_time = FALSE;
19598                         lowest_cp = this_start;
19599
19600                         /* We have set up the code point to compare with.
19601                          * Don't compare it with itself */
19602                         i++;
19603                     }
19604
19605                     /* Find the bit positions that differ from the lowest code
19606                      * point in the node.  Keep track of all such positions by
19607                      * OR'ing */
19608                     for (; i <= this_end; i++) {
19609                         if (! UVCHR_IS_INVARIANT(i)) {
19610                             goto done_anyofm;
19611                         }
19612
19613                         bits_differing  |= i ^ lowest_cp;
19614                     }
19615
19616                     full_cp_count += this_end - this_start + 1;
19617                 }
19618
19619                 /* At the end of the loop, we count how many bits differ from
19620                  * the bits in lowest code point, call the count 'd'.  If the
19621                  * set we found contains 2**d elements, it is the closure of
19622                  * all code points that differ only in those bit positions.  To
19623                  * convince yourself of that, first note that the number in the
19624                  * closure must be a power of 2, which we test for.  The only
19625                  * way we could have that count and it be some differing set,
19626                  * is if we got some code points that don't differ from the
19627                  * lowest code point in any position, but do differ from each
19628                  * other in some other position.  That means one code point has
19629                  * a 1 in that position, and another has a 0.  But that would
19630                  * mean that one of them differs from the lowest code point in
19631                  * that position, which possibility we've already excluded.  */
19632                 if (  (inverted || full_cp_count > 1)
19633                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19634                 {
19635                     U8 ANYOFM_mask;
19636
19637                     op = ANYOFM + inverted;;
19638
19639                     /* We need to make the bits that differ be 0's */
19640                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19641
19642                     /* The argument is the lowest code point */
19643                     ret = reganode(pRExC_state, op, lowest_cp);
19644                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19645                 }
19646
19647               done_anyofm:
19648                 invlist_iterfinish(cp_list);
19649             }
19650
19651             if (inverted) {
19652                 _invlist_invert(cp_list);
19653             }
19654
19655             if (op != END) {
19656                 goto not_anyof;
19657             }
19658
19659             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19660              * all were invariants, it wasn't inverted, and there is a single
19661              * range.  This would be faster than some of the posix nodes we
19662              * create below like /\d/a, but would be twice the size.  Without
19663              * having actually measured the gain, khw doesn't think the
19664              * tradeoff is really worth it */
19665         }
19666
19667         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19668             PERL_UINT_FAST8_T type;
19669             SV * intersection = NULL;
19670             SV* d_invlist = NULL;
19671
19672             /* See if this matches any of the POSIX classes.  The POSIXA and
19673              * POSIXD ones are about the same speed as ANYOF ops, but take less
19674              * room; the ones that have above-Latin1 code point matches are
19675              * somewhat faster than ANYOF.  */
19676
19677             for (type = POSIXA; type >= POSIXD; type--) {
19678                 int posix_class;
19679
19680                 if (type == POSIXL) {   /* But not /l posix classes */
19681                     continue;
19682                 }
19683
19684                 for (posix_class = 0;
19685                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19686                      posix_class++)
19687                 {
19688                     SV** our_code_points = &cp_list;
19689                     SV** official_code_points;
19690                     int try_inverted;
19691
19692                     if (type == POSIXA) {
19693                         official_code_points = &PL_Posix_ptrs[posix_class];
19694                     }
19695                     else {
19696                         official_code_points = &PL_XPosix_ptrs[posix_class];
19697                     }
19698
19699                     /* Skip non-existent classes of this type.  e.g. \v only
19700                      * has an entry in PL_XPosix_ptrs */
19701                     if (! *official_code_points) {
19702                         continue;
19703                     }
19704
19705                     /* Try both the regular class, and its inversion */
19706                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19707                         bool this_inverted = invert ^ try_inverted;
19708
19709                         if (type != POSIXD) {
19710
19711                             /* This class that isn't /d can't match if we have
19712                              * /d dependencies */
19713                             if (has_runtime_dependency
19714                                                     & HAS_D_RUNTIME_DEPENDENCY)
19715                             {
19716                                 continue;
19717                             }
19718                         }
19719                         else /* is /d */ if (! this_inverted) {
19720
19721                             /* /d classes don't match anything non-ASCII below
19722                              * 256 unconditionally (which cp_list contains) */
19723                             _invlist_intersection(cp_list, PL_UpperLatin1,
19724                                                            &intersection);
19725                             if (_invlist_len(intersection) != 0) {
19726                                 continue;
19727                             }
19728
19729                             SvREFCNT_dec(d_invlist);
19730                             d_invlist = invlist_clone(cp_list, NULL);
19731
19732                             /* But under UTF-8 it turns into using /u rules.
19733                              * Add the things it matches under these conditions
19734                              * so that we check below that these are identical
19735                              * to what the tested class should match */
19736                             if (upper_latin1_only_utf8_matches) {
19737                                 _invlist_union(
19738                                             d_invlist,
19739                                             upper_latin1_only_utf8_matches,
19740                                             &d_invlist);
19741                             }
19742                             our_code_points = &d_invlist;
19743                         }
19744                         else {  /* POSIXD, inverted.  If this doesn't have this
19745                                    flag set, it isn't /d. */
19746                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19747                             {
19748                                 continue;
19749                             }
19750                             our_code_points = &cp_list;
19751                         }
19752
19753                         /* Here, have weeded out some things.  We want to see
19754                          * if the list of characters this node contains
19755                          * ('*our_code_points') precisely matches those of the
19756                          * class we are currently checking against
19757                          * ('*official_code_points'). */
19758                         if (_invlistEQ(*our_code_points,
19759                                        *official_code_points,
19760                                        try_inverted))
19761                         {
19762                             /* Here, they precisely match.  Optimize this ANYOF
19763                              * node into its equivalent POSIX one of the
19764                              * correct type, possibly inverted */
19765                             ret = reg_node(pRExC_state, (try_inverted)
19766                                                         ? type + NPOSIXA
19767                                                                 - POSIXA
19768                                                         : type);
19769                             FLAGS(REGNODE_p(ret)) = posix_class;
19770                             SvREFCNT_dec(d_invlist);
19771                             SvREFCNT_dec(intersection);
19772                             goto not_anyof;
19773                         }
19774                     }
19775                 }
19776             }
19777             SvREFCNT_dec(d_invlist);
19778             SvREFCNT_dec(intersection);
19779         }
19780
19781         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19782          * both in size and speed.  Currently, a 20 bit range base (smallest
19783          * code point in the range), and a 12 bit maximum delta are packed into
19784          * a 32 bit word.  This allows for using it on all of the Unicode code
19785          * points except for the highest plane, which is only for private use
19786          * code points.  khw doubts that a bigger delta is likely in real world
19787          * applications */
19788         if (     single_range
19789             && ! has_runtime_dependency
19790             &&   anyof_flags == 0
19791             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19792             &&   end[0] - start[0]
19793                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19794                                    * CHARBITS - ANYOFR_BASE_BITS))))
19795
19796         {
19797             U8 low_utf8[UTF8_MAXBYTES+1];
19798             U8 high_utf8[UTF8_MAXBYTES+1];
19799
19800             ret = reganode(pRExC_state, ANYOFR,
19801                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19802
19803             /* Place the lowest UTF-8 start byte in the flags field, so as to
19804              * allow efficient ruling out at run time of many possible inputs.
19805              * */
19806             (void) uvchr_to_utf8(low_utf8, start[0]);
19807             (void) uvchr_to_utf8(high_utf8, end[0]);
19808
19809             /* If all code points share the same first byte, this can be an
19810              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19811              * quickly rule out many inputs at run-time without having to
19812              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19813              * not doing that transformation would not rule out nearly so many
19814              * things */
19815             if (low_utf8[0] == high_utf8[0]) {
19816                 OP(REGNODE_p(ret)) = ANYOFRb;
19817                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19818             }
19819             else {
19820                 ANYOF_FLAGS(REGNODE_p(ret))
19821                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19822             }
19823
19824             goto not_anyof;
19825         }
19826
19827         /* If didn't find an optimization and there is no need for a bitmap,
19828          * optimize to indicate that */
19829         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19830             && ! LOC
19831             && ! upper_latin1_only_utf8_matches
19832             &&   anyof_flags == 0)
19833         {
19834             U8 low_utf8[UTF8_MAXBYTES+1];
19835             UV highest_cp = invlist_highest(cp_list);
19836
19837             /* Currently the maximum allowed code point by the system is
19838              * IV_MAX.  Higher ones are reserved for future internal use.  This
19839              * particular regnode can be used for higher ones, but we can't
19840              * calculate the code point of those.  IV_MAX suffices though, as
19841              * it will be a large first byte */
19842             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19843                            - low_utf8;
19844
19845             /* We store the lowest possible first byte of the UTF-8
19846              * representation, using the flags field.  This allows for quick
19847              * ruling out of some inputs without having to convert from UTF-8
19848              * to code point.  For EBCDIC, we use I8, as not doing that
19849              * transformation would not rule out nearly so many things */
19850             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19851
19852             op = ANYOFH;
19853
19854             /* If the first UTF-8 start byte for the highest code point in the
19855              * range is suitably small, we may be able to get an upper bound as
19856              * well */
19857             if (highest_cp <= IV_MAX) {
19858                 U8 high_utf8[UTF8_MAXBYTES+1];
19859                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19860                                 - high_utf8;
19861
19862                 /* If the lowest and highest are the same, we can get an exact
19863                  * first byte instead of a just minimum or even a sequence of
19864                  * exact leading bytes.  We signal these with different
19865                  * regnodes */
19866                 if (low_utf8[0] == high_utf8[0]) {
19867                     Size_t len = find_first_differing_byte_pos(low_utf8,
19868                                                                high_utf8,
19869                                                        MIN(low_len, high_len));
19870
19871                     if (len == 1) {
19872
19873                         /* No need to convert to I8 for EBCDIC as this is an
19874                          * exact match */
19875                         anyof_flags = low_utf8[0];
19876                         op = ANYOFHb;
19877                     }
19878                     else {
19879                         op = ANYOFHs;
19880                         ret = regnode_guts(pRExC_state, op,
19881                                            regarglen[op] + STR_SZ(len),
19882                                            "anyofhs");
19883                         FILL_NODE(ret, op);
19884                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19885                                                                         = len;
19886                         Copy(low_utf8,  /* Add the common bytes */
19887                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19888                            len, U8);
19889                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19890                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19891                                                   NULL, only_utf8_locale_list);
19892                         goto not_anyof;
19893                     }
19894                 }
19895                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19896                 {
19897
19898                     /* Here, the high byte is not the same as the low, but is
19899                      * small enough that its reasonable to have a loose upper
19900                      * bound, which is packed in with the strict lower bound.
19901                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19902                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19903                      * is the same thing as UTF-8 */
19904
19905                     U8 bits = 0;
19906                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19907                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19908                                   - anyof_flags;
19909
19910                     if (range_diff <= max_range_diff / 8) {
19911                         bits = 3;
19912                     }
19913                     else if (range_diff <= max_range_diff / 4) {
19914                         bits = 2;
19915                     }
19916                     else if (range_diff <= max_range_diff / 2) {
19917                         bits = 1;
19918                     }
19919                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19920                     op = ANYOFHr;
19921                 }
19922             }
19923
19924             goto done_finding_op;
19925         }
19926     }   /* End of seeing if can optimize it into a different node */
19927
19928   is_anyof: /* It's going to be an ANYOF node. */
19929     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19930          ? ANYOFD
19931          : ((posixl)
19932             ? ANYOFPOSIXL
19933             : ((LOC)
19934                ? ANYOFL
19935                : ANYOF));
19936
19937   done_finding_op:
19938
19939     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19940     FILL_NODE(ret, op);        /* We set the argument later */
19941     RExC_emit += 1 + regarglen[op];
19942     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19943
19944     /* Here, <cp_list> contains all the code points we can determine at
19945      * compile time that match under all conditions.  Go through it, and
19946      * for things that belong in the bitmap, put them there, and delete from
19947      * <cp_list>.  While we are at it, see if everything above 255 is in the
19948      * list, and if so, set a flag to speed up execution */
19949
19950     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19951
19952     if (posixl) {
19953         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19954     }
19955
19956     if (invert) {
19957         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19958     }
19959
19960     /* Here, the bitmap has been populated with all the Latin1 code points that
19961      * always match.  Can now add to the overall list those that match only
19962      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19963      * */
19964     if (upper_latin1_only_utf8_matches) {
19965         if (cp_list) {
19966             _invlist_union(cp_list,
19967                            upper_latin1_only_utf8_matches,
19968                            &cp_list);
19969             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19970         }
19971         else {
19972             cp_list = upper_latin1_only_utf8_matches;
19973         }
19974         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19975     }
19976
19977     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19978                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19979                    ? listsv
19980                    : NULL,
19981                   only_utf8_locale_list);
19982     SvREFCNT_dec(cp_list);;
19983     SvREFCNT_dec(only_utf8_locale_list);
19984     return ret;
19985
19986   not_anyof:
19987
19988     /* Here, the node is getting optimized into something that's not an ANYOF
19989      * one.  Finish up. */
19990
19991     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19992                                            RExC_parse - orig_parse);;
19993     SvREFCNT_dec(cp_list);;
19994     SvREFCNT_dec(only_utf8_locale_list);
19995     return ret;
19996 }
19997
19998 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19999
20000 STATIC void
20001 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
20002                 regnode* const node,
20003                 SV* const cp_list,
20004                 SV* const runtime_defns,
20005                 SV* const only_utf8_locale_list)
20006 {
20007     /* Sets the arg field of an ANYOF-type node 'node', using information about
20008      * the node passed-in.  If there is nothing outside the node's bitmap, the
20009      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
20010      * the count returned by add_data(), having allocated and stored an array,
20011      * av, as follows:
20012      *
20013      *  av[0] stores the inversion list defining this class as far as known at
20014      *        this time, or PL_sv_undef if nothing definite is now known.
20015      *  av[1] stores the inversion list of code points that match only if the
20016      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
20017      *        av[2], or no entry otherwise.
20018      *  av[2] stores the list of user-defined properties whose subroutine
20019      *        definitions aren't known at this time, or no entry if none. */
20020
20021     UV n;
20022
20023     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20024
20025     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
20026         assert(! (ANYOF_FLAGS(node)
20027                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
20028         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
20029     }
20030     else {
20031         AV * const av = newAV();
20032         SV *rv;
20033
20034         if (cp_list) {
20035             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20036         }
20037
20038         /* (Note that if any of this changes, the size calculations in
20039          * S_optimize_regclass() might need to be updated.) */
20040
20041         if (only_utf8_locale_list) {
20042             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20043                                      SvREFCNT_inc_NN(only_utf8_locale_list));
20044         }
20045
20046         if (runtime_defns) {
20047             av_store(av, DEFERRED_USER_DEFINED_INDEX,
20048                          SvREFCNT_inc_NN(runtime_defns));
20049         }
20050
20051         rv = newRV_noinc(MUTABLE_SV(av));
20052         n = add_data(pRExC_state, STR_WITH_LEN("s"));
20053         RExC_rxi->data->data[n] = (void*)rv;
20054         ARG_SET(node, n);
20055     }
20056 }
20057
20058 SV *
20059
20060 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20061 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20062 #else
20063 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20064 #endif
20065
20066 {
20067     /* For internal core use only.
20068      * Returns the inversion list for the input 'node' in the regex 'prog'.
20069      * If <doinit> is 'true', will attempt to create the inversion list if not
20070      *    already done.
20071      * If <listsvp> is non-null, will return the printable contents of the
20072      *    property definition.  This can be used to get debugging information
20073      *    even before the inversion list exists, by calling this function with
20074      *    'doinit' set to false, in which case the components that will be used
20075      *    to eventually create the inversion list are returned  (in a printable
20076      *    form).
20077      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20078      *    store an inversion list of code points that should match only if the
20079      *    execution-time locale is a UTF-8 one.
20080      * If <output_invlist> is not NULL, it is where this routine is to store an
20081      *    inversion list of the code points that would be instead returned in
20082      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
20083      *    when this parameter is used, is just the non-code point data that
20084      *    will go into creating the inversion list.  This currently should be just
20085      *    user-defined properties whose definitions were not known at compile
20086      *    time.  Using this parameter allows for easier manipulation of the
20087      *    inversion list's data by the caller.  It is illegal to call this
20088      *    function with this parameter set, but not <listsvp>
20089      *
20090      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
20091      * that, in spite of this function's name, the inversion list it returns
20092      * may include the bitmap data as well */
20093
20094     SV *si  = NULL;         /* Input initialization string */
20095     SV* invlist = NULL;
20096
20097     RXi_GET_DECL(prog, progi);
20098     const struct reg_data * const data = prog ? progi->data : NULL;
20099
20100 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20101     PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20102 #else
20103     PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20104 #endif
20105     assert(! output_invlist || listsvp);
20106
20107     if (data && data->count) {
20108         const U32 n = ARG(node);
20109
20110         if (data->what[n] == 's') {
20111             SV * const rv = MUTABLE_SV(data->data[n]);
20112             AV * const av = MUTABLE_AV(SvRV(rv));
20113             SV **const ary = AvARRAY(av);
20114
20115             invlist = ary[INVLIST_INDEX];
20116
20117             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20118                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20119             }
20120
20121             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20122                 si = ary[DEFERRED_USER_DEFINED_INDEX];
20123             }
20124
20125             if (doinit && (si || invlist)) {
20126                 if (si) {
20127                     bool user_defined;
20128                     SV * msg = newSVpvs_flags("", SVs_TEMP);
20129
20130                     SV * prop_definition = handle_user_defined_property(
20131                             "", 0, FALSE,   /* There is no \p{}, \P{} */
20132                             SvPVX_const(si)[1] - '0',   /* /i or not has been
20133                                                            stored here for just
20134                                                            this occasion */
20135                             TRUE,           /* run time */
20136                             FALSE,          /* This call must find the defn */
20137                             si,             /* The property definition  */
20138                             &user_defined,
20139                             msg,
20140                             0               /* base level call */
20141                            );
20142
20143                     if (SvCUR(msg)) {
20144                         assert(prop_definition == NULL);
20145
20146                         Perl_croak(aTHX_ "%" UTF8f,
20147                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20148                     }
20149
20150                     if (invlist) {
20151                         _invlist_union(invlist, prop_definition, &invlist);
20152                         SvREFCNT_dec_NN(prop_definition);
20153                     }
20154                     else {
20155                         invlist = prop_definition;
20156                     }
20157
20158                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20159                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20160
20161                     ary[INVLIST_INDEX] = invlist;
20162                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20163                                  ? ONLY_LOCALE_MATCHES_INDEX
20164                                  : INVLIST_INDEX);
20165                     si = NULL;
20166                 }
20167             }
20168         }
20169     }
20170
20171     /* If requested, return a printable version of what this ANYOF node matches
20172      * */
20173     if (listsvp) {
20174         SV* matches_string = NULL;
20175
20176         /* This function can be called at compile-time, before everything gets
20177          * resolved, in which case we return the currently best available
20178          * information, which is the string that will eventually be used to do
20179          * that resolving, 'si' */
20180         if (si) {
20181             /* Here, we only have 'si' (and possibly some passed-in data in
20182              * 'invlist', which is handled below)  If the caller only wants
20183              * 'si', use that.  */
20184             if (! output_invlist) {
20185                 matches_string = newSVsv(si);
20186             }
20187             else {
20188                 /* But if the caller wants an inversion list of the node, we
20189                  * need to parse 'si' and place as much as possible in the
20190                  * desired output inversion list, making 'matches_string' only
20191                  * contain the currently unresolvable things */
20192                 const char *si_string = SvPVX(si);
20193                 STRLEN remaining = SvCUR(si);
20194                 UV prev_cp = 0;
20195                 U8 count = 0;
20196
20197                 /* Ignore everything before and including the first new-line */
20198                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20199                 assert (si_string != NULL);
20200                 si_string++;
20201                 remaining = SvPVX(si) + SvCUR(si) - si_string;
20202
20203                 while (remaining > 0) {
20204
20205                     /* The data consists of just strings defining user-defined
20206                      * property names, but in prior incarnations, and perhaps
20207                      * somehow from pluggable regex engines, it could still
20208                      * hold hex code point definitions, all of which should be
20209                      * legal (or it wouldn't have gotten this far).  Each
20210                      * component of a range would be separated by a tab, and
20211                      * each range by a new-line.  If these are found, instead
20212                      * add them to the inversion list */
20213                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
20214                                      |PERL_SCAN_SILENT_NON_PORTABLE;
20215                     STRLEN len = remaining;
20216                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20217
20218                     /* If the hex decode routine found something, it should go
20219                      * up to the next \n */
20220                     if (   *(si_string + len) == '\n') {
20221                         if (count) {    /* 2nd code point on line */
20222                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20223                         }
20224                         else {
20225                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20226                         }
20227                         count = 0;
20228                         goto prepare_for_next_iteration;
20229                     }
20230
20231                     /* If the hex decode was instead for the lower range limit,
20232                      * save it, and go parse the upper range limit */
20233                     if (*(si_string + len) == '\t') {
20234                         assert(count == 0);
20235
20236                         prev_cp = cp;
20237                         count = 1;
20238                       prepare_for_next_iteration:
20239                         si_string += len + 1;
20240                         remaining -= len + 1;
20241                         continue;
20242                     }
20243
20244                     /* Here, didn't find a legal hex number.  Just add the text
20245                      * from here up to the next \n, omitting any trailing
20246                      * markers. */
20247
20248                     remaining -= len;
20249                     len = strcspn(si_string,
20250                                         DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20251                     remaining -= len;
20252                     if (matches_string) {
20253                         sv_catpvn(matches_string, si_string, len);
20254                     }
20255                     else {
20256                         matches_string = newSVpvn(si_string, len);
20257                     }
20258                     sv_catpvs(matches_string, " ");
20259
20260                     si_string += len;
20261                     if (   remaining
20262                         && UCHARAT(si_string)
20263                                             == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20264                     {
20265                         si_string++;
20266                         remaining--;
20267                     }
20268                     if (remaining && UCHARAT(si_string) == '\n') {
20269                         si_string++;
20270                         remaining--;
20271                     }
20272                 } /* end of loop through the text */
20273
20274                 assert(matches_string);
20275                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
20276                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20277                 }
20278             } /* end of has an 'si' */
20279         }
20280
20281         /* Add the stuff that's already known */
20282         if (invlist) {
20283
20284             /* Again, if the caller doesn't want the output inversion list, put
20285              * everything in 'matches-string' */
20286             if (! output_invlist) {
20287                 if ( ! matches_string) {
20288                     matches_string = newSVpvs("\n");
20289                 }
20290                 sv_catsv(matches_string, invlist_contents(invlist,
20291                                                   TRUE /* traditional style */
20292                                                   ));
20293             }
20294             else if (! *output_invlist) {
20295                 *output_invlist = invlist_clone(invlist, NULL);
20296             }
20297             else {
20298                 _invlist_union(*output_invlist, invlist, output_invlist);
20299             }
20300         }
20301
20302         *listsvp = matches_string;
20303     }
20304
20305     return invlist;
20306 }
20307
20308 /* reg_skipcomment()
20309
20310    Absorbs an /x style # comment from the input stream,
20311    returning a pointer to the first character beyond the comment, or if the
20312    comment terminates the pattern without anything following it, this returns
20313    one past the final character of the pattern (in other words, RExC_end) and
20314    sets the REG_RUN_ON_COMMENT_SEEN flag.
20315
20316    Note it's the callers responsibility to ensure that we are
20317    actually in /x mode
20318
20319 */
20320
20321 PERL_STATIC_INLINE char*
20322 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20323 {
20324     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20325
20326     assert(*p == '#');
20327
20328     while (p < RExC_end) {
20329         if (*(++p) == '\n') {
20330             return p+1;
20331         }
20332     }
20333
20334     /* we ran off the end of the pattern without ending the comment, so we have
20335      * to add an \n when wrapping */
20336     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20337     return p;
20338 }
20339
20340 STATIC void
20341 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20342                                 char ** p,
20343                                 const bool force_to_xmod
20344                          )
20345 {
20346     /* If the text at the current parse position '*p' is a '(?#...)' comment,
20347      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20348      * is /x whitespace, advance '*p' so that on exit it points to the first
20349      * byte past all such white space and comments */
20350
20351     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20352
20353     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20354
20355     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20356
20357     for (;;) {
20358         if (RExC_end - (*p) >= 3
20359             && *(*p)     == '('
20360             && *(*p + 1) == '?'
20361             && *(*p + 2) == '#')
20362         {
20363             while (*(*p) != ')') {
20364                 if ((*p) == RExC_end)
20365                     FAIL("Sequence (?#... not terminated");
20366                 (*p)++;
20367             }
20368             (*p)++;
20369             continue;
20370         }
20371
20372         if (use_xmod) {
20373             const char * save_p = *p;
20374             while ((*p) < RExC_end) {
20375                 STRLEN len;
20376                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20377                     (*p) += len;
20378                 }
20379                 else if (*(*p) == '#') {
20380                     (*p) = reg_skipcomment(pRExC_state, (*p));
20381                 }
20382                 else {
20383                     break;
20384                 }
20385             }
20386             if (*p != save_p) {
20387                 continue;
20388             }
20389         }
20390
20391         break;
20392     }
20393
20394     return;
20395 }
20396
20397 /* nextchar()
20398
20399    Advances the parse position by one byte, unless that byte is the beginning
20400    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20401    those two cases, the parse position is advanced beyond all such comments and
20402    white space.
20403
20404    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20405 */
20406
20407 STATIC void
20408 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20409 {
20410     PERL_ARGS_ASSERT_NEXTCHAR;
20411
20412     if (RExC_parse < RExC_end) {
20413         assert(   ! UTF
20414                || UTF8_IS_INVARIANT(*RExC_parse)
20415                || UTF8_IS_START(*RExC_parse));
20416
20417         RExC_parse += (UTF)
20418                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20419                       : 1;
20420
20421         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20422                                 FALSE /* Don't force /x */ );
20423     }
20424 }
20425
20426 STATIC void
20427 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20428 {
20429     /* 'size' is the delta number of smallest regnode equivalents to add or
20430      * subtract from the current memory allocated to the regex engine being
20431      * constructed. */
20432
20433     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20434
20435     RExC_size += size;
20436
20437     Renewc(RExC_rxi,
20438            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20439                                                 /* +1 for REG_MAGIC */
20440            char,
20441            regexp_internal);
20442     if ( RExC_rxi == NULL )
20443         FAIL("Regexp out of space");
20444     RXi_SET(RExC_rx, RExC_rxi);
20445
20446     RExC_emit_start = RExC_rxi->program;
20447     if (size > 0) {
20448         Zero(REGNODE_p(RExC_emit), size, regnode);
20449     }
20450
20451 #ifdef RE_TRACK_PATTERN_OFFSETS
20452     Renew(RExC_offsets, 2*RExC_size+1, U32);
20453     if (size > 0) {
20454         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20455     }
20456     RExC_offsets[0] = RExC_size;
20457 #endif
20458 }
20459
20460 STATIC regnode_offset
20461 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20462 {
20463     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20464      * equivalents space.  It aligns and increments RExC_size
20465      *
20466      * It returns the regnode's offset into the regex engine program */
20467
20468     const regnode_offset ret = RExC_emit;
20469
20470     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20471
20472     PERL_ARGS_ASSERT_REGNODE_GUTS;
20473
20474     SIZE_ALIGN(RExC_size);
20475     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20476     NODE_ALIGN_FILL(REGNODE_p(ret));
20477 #ifndef RE_TRACK_PATTERN_OFFSETS
20478     PERL_UNUSED_ARG(name);
20479     PERL_UNUSED_ARG(op);
20480 #else
20481     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20482
20483     if (RExC_offsets) {         /* MJD */
20484         MJD_OFFSET_DEBUG(
20485               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20486               name, __LINE__,
20487               PL_reg_name[op],
20488               (UV)(RExC_emit) > RExC_offsets[0]
20489                 ? "Overwriting end of array!\n" : "OK",
20490               (UV)(RExC_emit),
20491               (UV)(RExC_parse - RExC_start),
20492               (UV)RExC_offsets[0]));
20493         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20494     }
20495 #endif
20496     return(ret);
20497 }
20498
20499 /*
20500 - reg_node - emit a node
20501 */
20502 STATIC regnode_offset /* Location. */
20503 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20504 {
20505     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20506     regnode_offset ptr = ret;
20507
20508     PERL_ARGS_ASSERT_REG_NODE;
20509
20510     assert(regarglen[op] == 0);
20511
20512     FILL_ADVANCE_NODE(ptr, op);
20513     RExC_emit = ptr;
20514     return(ret);
20515 }
20516
20517 /*
20518 - reganode - emit a node with an argument
20519 */
20520 STATIC regnode_offset /* Location. */
20521 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20522 {
20523     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20524     regnode_offset ptr = ret;
20525
20526     PERL_ARGS_ASSERT_REGANODE;
20527
20528     /* ANYOF are special cased to allow non-length 1 args */
20529     assert(regarglen[op] == 1);
20530
20531     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20532     RExC_emit = ptr;
20533     return(ret);
20534 }
20535
20536 /*
20537 - regpnode - emit a temporary node with a SV* argument
20538 */
20539 STATIC regnode_offset /* Location. */
20540 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20541 {
20542     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20543     regnode_offset ptr = ret;
20544
20545     PERL_ARGS_ASSERT_REGPNODE;
20546
20547     FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20548     RExC_emit = ptr;
20549     return(ret);
20550 }
20551
20552 STATIC regnode_offset
20553 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20554 {
20555     /* emit a node with U32 and I32 arguments */
20556
20557     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20558     regnode_offset ptr = ret;
20559
20560     PERL_ARGS_ASSERT_REG2LANODE;
20561
20562     assert(regarglen[op] == 2);
20563
20564     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20565     RExC_emit = ptr;
20566     return(ret);
20567 }
20568
20569 /*
20570 - reginsert - insert an operator in front of already-emitted operand
20571 *
20572 * That means that on exit 'operand' is the offset of the newly inserted
20573 * operator, and the original operand has been relocated.
20574 *
20575 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20576 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20577 *
20578 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20579 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20580 *
20581 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20582 */
20583 STATIC void
20584 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20585                   const regnode_offset operand, const U32 depth)
20586 {
20587     regnode *src;
20588     regnode *dst;
20589     regnode *place;
20590     const int offset = regarglen[(U8)op];
20591     const int size = NODE_STEP_REGNODE + offset;
20592     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20593
20594     PERL_ARGS_ASSERT_REGINSERT;
20595     PERL_UNUSED_CONTEXT;
20596     PERL_UNUSED_ARG(depth);
20597 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20598     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20599     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20600                                     studying. If this is wrong then we need to adjust RExC_recurse
20601                                     below like we do with RExC_open_parens/RExC_close_parens. */
20602     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20603     src = REGNODE_p(RExC_emit);
20604     RExC_emit += size;
20605     dst = REGNODE_p(RExC_emit);
20606
20607     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20608      * and [perl #133871] shows this can lead to problems, so skip this
20609      * realignment of parens until a later pass when they are reliable */
20610     if (! IN_PARENS_PASS && RExC_open_parens) {
20611         int paren;
20612         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20613         /* remember that RExC_npar is rex->nparens + 1,
20614          * iow it is 1 more than the number of parens seen in
20615          * the pattern so far. */
20616         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20617             /* note, RExC_open_parens[0] is the start of the
20618              * regex, it can't move. RExC_close_parens[0] is the end
20619              * of the regex, it *can* move. */
20620             if ( paren && RExC_open_parens[paren] >= operand ) {
20621                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20622                 RExC_open_parens[paren] += size;
20623             } else {
20624                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20625             }
20626             if ( RExC_close_parens[paren] >= operand ) {
20627                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20628                 RExC_close_parens[paren] += size;
20629             } else {
20630                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20631             }
20632         }
20633     }
20634     if (RExC_end_op)
20635         RExC_end_op += size;
20636
20637     while (src > REGNODE_p(operand)) {
20638         StructCopy(--src, --dst, regnode);
20639 #ifdef RE_TRACK_PATTERN_OFFSETS
20640         if (RExC_offsets) {     /* MJD 20010112 */
20641             MJD_OFFSET_DEBUG(
20642                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20643                   "reginsert",
20644                   __LINE__,
20645                   PL_reg_name[op],
20646                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20647                     ? "Overwriting end of array!\n" : "OK",
20648                   (UV)REGNODE_OFFSET(src),
20649                   (UV)REGNODE_OFFSET(dst),
20650                   (UV)RExC_offsets[0]));
20651             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20652             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20653         }
20654 #endif
20655     }
20656
20657     place = REGNODE_p(operand); /* Op node, where operand used to be. */
20658 #ifdef RE_TRACK_PATTERN_OFFSETS
20659     if (RExC_offsets) {         /* MJD */
20660         MJD_OFFSET_DEBUG(
20661               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20662               "reginsert",
20663               __LINE__,
20664               PL_reg_name[op],
20665               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20666               ? "Overwriting end of array!\n" : "OK",
20667               (UV)REGNODE_OFFSET(place),
20668               (UV)(RExC_parse - RExC_start),
20669               (UV)RExC_offsets[0]));
20670         Set_Node_Offset(place, RExC_parse);
20671         Set_Node_Length(place, 1);
20672     }
20673 #endif
20674     src = NEXTOPER(place);
20675     FLAGS(place) = 0;
20676     FILL_NODE(operand, op);
20677
20678     /* Zero out any arguments in the new node */
20679     Zero(src, offset, regnode);
20680 }
20681
20682 /*
20683 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20684             that value won't fit in the space available, instead returns FALSE.
20685             (Except asserts if we can't fit in the largest space the regex
20686             engine is designed for.)
20687 - SEE ALSO: regtail_study
20688 */
20689 STATIC bool
20690 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20691                 const regnode_offset p,
20692                 const regnode_offset val,
20693                 const U32 depth)
20694 {
20695     regnode_offset scan;
20696     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20697
20698     PERL_ARGS_ASSERT_REGTAIL;
20699 #ifndef DEBUGGING
20700     PERL_UNUSED_ARG(depth);
20701 #endif
20702
20703     /* The final node in the chain is the first one with a nonzero next pointer
20704      * */
20705     scan = (regnode_offset) p;
20706     for (;;) {
20707         regnode * const temp = regnext(REGNODE_p(scan));
20708         DEBUG_PARSE_r({
20709             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20710             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20711             Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
20712                 SvPV_nolen_const(RExC_mysv), scan,
20713                     (temp == NULL ? "->" : ""),
20714                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20715             );
20716         });
20717         if (temp == NULL)
20718             break;
20719         scan = REGNODE_OFFSET(temp);
20720     }
20721
20722     /* Populate this node's next pointer */
20723     assert(val >= scan);
20724     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20725         assert((UV) (val - scan) <= U32_MAX);
20726         ARG_SET(REGNODE_p(scan), val - scan);
20727     }
20728     else {
20729         if (val - scan > U16_MAX) {
20730             /* Populate this with something that won't loop and will likely
20731              * lead to a crash if the caller ignores the failure return, and
20732              * execution continues */
20733             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20734             return FALSE;
20735         }
20736         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20737     }
20738
20739     return TRUE;
20740 }
20741
20742 #ifdef DEBUGGING
20743 /*
20744 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20745 - Look for optimizable sequences at the same time.
20746 - currently only looks for EXACT chains.
20747
20748 This is experimental code. The idea is to use this routine to perform
20749 in place optimizations on branches and groups as they are constructed,
20750 with the long term intention of removing optimization from study_chunk so
20751 that it is purely analytical.
20752
20753 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20754 to control which is which.
20755
20756 This used to return a value that was ignored.  It was a problem that it is
20757 #ifdef'd to be another function that didn't return a value.  khw has changed it
20758 so both currently return a pass/fail return.
20759
20760 */
20761 /* TODO: All four parms should be const */
20762
20763 STATIC bool
20764 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20765                       const regnode_offset val, U32 depth)
20766 {
20767     regnode_offset scan;
20768     U8 exact = PSEUDO;
20769 #ifdef EXPERIMENTAL_INPLACESCAN
20770     I32 min = 0;
20771 #endif
20772     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20773
20774     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20775
20776
20777     /* Find last node. */
20778
20779     scan = p;
20780     for (;;) {
20781         regnode * const temp = regnext(REGNODE_p(scan));
20782 #ifdef EXPERIMENTAL_INPLACESCAN
20783         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20784             bool unfolded_multi_char;   /* Unexamined in this routine */
20785             if (join_exact(pRExC_state, scan, &min,
20786                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20787                 return TRUE; /* Was return EXACT */
20788         }
20789 #endif
20790         if ( exact ) {
20791             if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20792                 if (exact == PSEUDO )
20793                     exact= OP(REGNODE_p(scan));
20794                 else if (exact != OP(REGNODE_p(scan)) )
20795                     exact= 0;
20796             }
20797             else if (OP(REGNODE_p(scan)) != NOTHING) {
20798                 exact= 0;
20799             }
20800         }
20801         DEBUG_PARSE_r({
20802             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20803             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20804             Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
20805                 SvPV_nolen_const(RExC_mysv),
20806                 scan,
20807                 PL_reg_name[exact]);
20808         });
20809         if (temp == NULL)
20810             break;
20811         scan = REGNODE_OFFSET(temp);
20812     }
20813     DEBUG_PARSE_r({
20814         DEBUG_PARSE_MSG("");
20815         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20816         Perl_re_printf( aTHX_
20817                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20818                       SvPV_nolen_const(RExC_mysv),
20819                       (IV)val,
20820                       (IV)(val - scan)
20821         );
20822     });
20823     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20824         assert((UV) (val - scan) <= U32_MAX);
20825         ARG_SET(REGNODE_p(scan), val - scan);
20826     }
20827     else {
20828         if (val - scan > U16_MAX) {
20829             /* Populate this with something that won't loop and will likely
20830              * lead to a crash if the caller ignores the failure return, and
20831              * execution continues */
20832             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20833             return FALSE;
20834         }
20835         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20836     }
20837
20838     return TRUE; /* Was 'return exact' */
20839 }
20840 #endif
20841
20842 STATIC SV*
20843 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20844
20845     /* Returns an inversion list of all the code points matched by the
20846      * ANYOFM/NANYOFM node 'n' */
20847
20848     SV * cp_list = _new_invlist(-1);
20849     const U8 lowest = (U8) ARG(n);
20850     unsigned int i;
20851     U8 count = 0;
20852     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20853
20854     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20855
20856     /* Starting with the lowest code point, any code point that ANDed with the
20857      * mask yields the lowest code point is in the set */
20858     for (i = lowest; i <= 0xFF; i++) {
20859         if ((i & FLAGS(n)) == ARG(n)) {
20860             cp_list = add_cp_to_invlist(cp_list, i);
20861             count++;
20862
20863             /* We know how many code points (a power of two) that are in the
20864              * set.  No use looking once we've got that number */
20865             if (count >= needed) break;
20866         }
20867     }
20868
20869     if (OP(n) == NANYOFM) {
20870         _invlist_invert(cp_list);
20871     }
20872     return cp_list;
20873 }
20874
20875 /*
20876  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20877  */
20878 #ifdef DEBUGGING
20879
20880 static void
20881 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20882 {
20883     int bit;
20884     int set=0;
20885
20886     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20887
20888     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20889         if (flags & (1<<bit)) {
20890             if (!set++ && lead)
20891                 Perl_re_printf( aTHX_  "%s", lead);
20892             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20893         }
20894     }
20895     if (lead)  {
20896         if (set)
20897             Perl_re_printf( aTHX_  "\n");
20898         else
20899             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20900     }
20901 }
20902
20903 static void
20904 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20905 {
20906     int bit;
20907     int set=0;
20908     regex_charset cs;
20909
20910     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20911
20912     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20913         if (flags & (1<<bit)) {
20914             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20915                 continue;
20916             }
20917             if (!set++ && lead)
20918                 Perl_re_printf( aTHX_  "%s", lead);
20919             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20920         }
20921     }
20922     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20923             if (!set++ && lead) {
20924                 Perl_re_printf( aTHX_  "%s", lead);
20925             }
20926             switch (cs) {
20927                 case REGEX_UNICODE_CHARSET:
20928                     Perl_re_printf( aTHX_  "UNICODE");
20929                     break;
20930                 case REGEX_LOCALE_CHARSET:
20931                     Perl_re_printf( aTHX_  "LOCALE");
20932                     break;
20933                 case REGEX_ASCII_RESTRICTED_CHARSET:
20934                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20935                     break;
20936                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20937                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20938                     break;
20939                 default:
20940                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20941                     break;
20942             }
20943     }
20944     if (lead)  {
20945         if (set)
20946             Perl_re_printf( aTHX_  "\n");
20947         else
20948             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20949     }
20950 }
20951 #endif
20952
20953 void
20954 Perl_regdump(pTHX_ const regexp *r)
20955 {
20956 #ifdef DEBUGGING
20957     int i;
20958     SV * const sv = sv_newmortal();
20959     SV *dsv= sv_newmortal();
20960     RXi_GET_DECL(r, ri);
20961     DECLARE_AND_GET_RE_DEBUG_FLAGS;
20962
20963     PERL_ARGS_ASSERT_REGDUMP;
20964
20965     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20966
20967     /* Header fields of interest. */
20968     for (i = 0; i < 2; i++) {
20969         if (r->substrs->data[i].substr) {
20970             RE_PV_QUOTED_DECL(s, 0, dsv,
20971                             SvPVX_const(r->substrs->data[i].substr),
20972                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20973                             PL_dump_re_max_len);
20974             Perl_re_printf( aTHX_
20975                           "%s %s%s at %" IVdf "..%" UVuf " ",
20976                           i ? "floating" : "anchored",
20977                           s,
20978                           RE_SV_TAIL(r->substrs->data[i].substr),
20979                           (IV)r->substrs->data[i].min_offset,
20980                           (UV)r->substrs->data[i].max_offset);
20981         }
20982         else if (r->substrs->data[i].utf8_substr) {
20983             RE_PV_QUOTED_DECL(s, 1, dsv,
20984                             SvPVX_const(r->substrs->data[i].utf8_substr),
20985                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20986                             30);
20987             Perl_re_printf( aTHX_
20988                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20989                           i ? "floating" : "anchored",
20990                           s,
20991                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20992                           (IV)r->substrs->data[i].min_offset,
20993                           (UV)r->substrs->data[i].max_offset);
20994         }
20995     }
20996
20997     if (r->check_substr || r->check_utf8)
20998         Perl_re_printf( aTHX_
20999                       (const char *)
21000                       (   r->check_substr == r->substrs->data[1].substr
21001                        && r->check_utf8   == r->substrs->data[1].utf8_substr
21002                        ? "(checking floating" : "(checking anchored"));
21003     if (r->intflags & PREGf_NOSCAN)
21004         Perl_re_printf( aTHX_  " noscan");
21005     if (r->extflags & RXf_CHECK_ALL)
21006         Perl_re_printf( aTHX_  " isall");
21007     if (r->check_substr || r->check_utf8)
21008         Perl_re_printf( aTHX_  ") ");
21009
21010     if (ri->regstclass) {
21011         regprop(r, sv, ri->regstclass, NULL, NULL);
21012         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
21013     }
21014     if (r->intflags & PREGf_ANCH) {
21015         Perl_re_printf( aTHX_  "anchored");
21016         if (r->intflags & PREGf_ANCH_MBOL)
21017             Perl_re_printf( aTHX_  "(MBOL)");
21018         if (r->intflags & PREGf_ANCH_SBOL)
21019             Perl_re_printf( aTHX_  "(SBOL)");
21020         if (r->intflags & PREGf_ANCH_GPOS)
21021             Perl_re_printf( aTHX_  "(GPOS)");
21022         Perl_re_printf( aTHX_ " ");
21023     }
21024     if (r->intflags & PREGf_GPOS_SEEN)
21025         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
21026     if (r->intflags & PREGf_SKIP)
21027         Perl_re_printf( aTHX_  "plus ");
21028     if (r->intflags & PREGf_IMPLICIT)
21029         Perl_re_printf( aTHX_  "implicit ");
21030     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
21031     if (r->extflags & RXf_EVAL_SEEN)
21032         Perl_re_printf( aTHX_  "with eval ");
21033     Perl_re_printf( aTHX_  "\n");
21034     DEBUG_FLAGS_r({
21035         regdump_extflags("r->extflags: ", r->extflags);
21036         regdump_intflags("r->intflags: ", r->intflags);
21037     });
21038 #else
21039     PERL_ARGS_ASSERT_REGDUMP;
21040     PERL_UNUSED_CONTEXT;
21041     PERL_UNUSED_ARG(r);
21042 #endif  /* DEBUGGING */
21043 }
21044
21045 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21046 #ifdef DEBUGGING
21047
21048 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
21049      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
21050      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
21051      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
21052      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
21053      || _CC_VERTSPACE != 15
21054 #   error Need to adjust order of anyofs[]
21055 #  endif
21056 static const char * const anyofs[] = {
21057     "\\w",
21058     "\\W",
21059     "\\d",
21060     "\\D",
21061     "[:alpha:]",
21062     "[:^alpha:]",
21063     "[:lower:]",
21064     "[:^lower:]",
21065     "[:upper:]",
21066     "[:^upper:]",
21067     "[:punct:]",
21068     "[:^punct:]",
21069     "[:print:]",
21070     "[:^print:]",
21071     "[:alnum:]",
21072     "[:^alnum:]",
21073     "[:graph:]",
21074     "[:^graph:]",
21075     "[:cased:]",
21076     "[:^cased:]",
21077     "\\s",
21078     "\\S",
21079     "[:blank:]",
21080     "[:^blank:]",
21081     "[:xdigit:]",
21082     "[:^xdigit:]",
21083     "[:cntrl:]",
21084     "[:^cntrl:]",
21085     "[:ascii:]",
21086     "[:^ascii:]",
21087     "\\v",
21088     "\\V"
21089 };
21090 #endif
21091
21092 /*
21093 - regprop - printable representation of opcode, with run time support
21094 */
21095
21096 void
21097 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21098 {
21099 #ifdef DEBUGGING
21100     int k;
21101     RXi_GET_DECL(prog, progi);
21102     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21103
21104     PERL_ARGS_ASSERT_REGPROP;
21105
21106     SvPVCLEAR(sv);
21107
21108     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
21109         if (pRExC_state) {  /* This gives more info, if we have it */
21110             FAIL3("panic: corrupted regexp opcode %d > %d",
21111                   (int)OP(o), (int)REGNODE_MAX);
21112         }
21113         else {
21114             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21115                              (int)OP(o), (int)REGNODE_MAX);
21116         }
21117     }
21118     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21119
21120     k = PL_regkind[OP(o)];
21121
21122     if (k == EXACT) {
21123         sv_catpvs(sv, " ");
21124         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21125          * is a crude hack but it may be the best for now since
21126          * we have no flag "this EXACTish node was UTF-8"
21127          * --jhi */
21128         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21129                   PL_colors[0], PL_colors[1],
21130                   PERL_PV_ESCAPE_UNI_DETECT |
21131                   PERL_PV_ESCAPE_NONASCII   |
21132                   PERL_PV_PRETTY_ELLIPSES   |
21133                   PERL_PV_PRETTY_LTGT       |
21134                   PERL_PV_PRETTY_NOCLEAR
21135                   );
21136     } else if (k == TRIE) {
21137         /* print the details of the trie in dumpuntil instead, as
21138          * progi->data isn't available here */
21139         const char op = OP(o);
21140         const U32 n = ARG(o);
21141         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21142                (reg_ac_data *)progi->data->data[n] :
21143                NULL;
21144         const reg_trie_data * const trie
21145             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21146
21147         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21148         DEBUG_TRIE_COMPILE_r({
21149           if (trie->jump)
21150             sv_catpvs(sv, "(JUMP)");
21151           Perl_sv_catpvf(aTHX_ sv,
21152             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21153             (UV)trie->startstate,
21154             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21155             (UV)trie->wordcount,
21156             (UV)trie->minlen,
21157             (UV)trie->maxlen,
21158             (UV)TRIE_CHARCOUNT(trie),
21159             (UV)trie->uniquecharcount
21160           );
21161         });
21162         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21163             sv_catpvs(sv, "[");
21164             (void) put_charclass_bitmap_innards(sv,
21165                                                 ((IS_ANYOF_TRIE(op))
21166                                                  ? ANYOF_BITMAP(o)
21167                                                  : TRIE_BITMAP(trie)),
21168                                                 NULL,
21169                                                 NULL,
21170                                                 NULL,
21171                                                 0,
21172                                                 FALSE
21173                                                );
21174             sv_catpvs(sv, "]");
21175         }
21176     } else if (k == CURLY) {
21177         U32 lo = ARG1(o), hi = ARG2(o);
21178         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21179             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21180         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21181         if (hi == REG_INFTY)
21182             sv_catpvs(sv, "INFTY");
21183         else
21184             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21185         sv_catpvs(sv, "}");
21186     }
21187     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
21188         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21189     else if (k == REF || k == OPEN || k == CLOSE
21190              || k == GROUPP || OP(o)==ACCEPT)
21191     {
21192         AV *name_list= NULL;
21193         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21194         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
21195         if ( RXp_PAREN_NAMES(prog) ) {
21196             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21197         } else if ( pRExC_state ) {
21198             name_list= RExC_paren_name_list;
21199         }
21200         if (name_list) {
21201             if ( k != REF || (OP(o) < REFN)) {
21202                 SV **name= av_fetch(name_list, parno, 0 );
21203                 if (name)
21204                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21205             }
21206             else {
21207                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21208                 I32 *nums=(I32*)SvPVX(sv_dat);
21209                 SV **name= av_fetch(name_list, nums[0], 0 );
21210                 I32 n;
21211                 if (name) {
21212                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
21213                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21214                                     (n ? "," : ""), (IV)nums[n]);
21215                     }
21216                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21217                 }
21218             }
21219         }
21220         if ( k == REF && reginfo) {
21221             U32 n = ARG(o);  /* which paren pair */
21222             I32 ln = prog->offs[n].start;
21223             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21224                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21225             else if (ln == prog->offs[n].end)
21226                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21227             else {
21228                 const char *s = reginfo->strbeg + ln;
21229                 Perl_sv_catpvf(aTHX_ sv, ": ");
21230                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21231                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21232             }
21233         }
21234     } else if (k == GOSUB) {
21235         AV *name_list= NULL;
21236         if ( RXp_PAREN_NAMES(prog) ) {
21237             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21238         } else if ( pRExC_state ) {
21239             name_list= RExC_paren_name_list;
21240         }
21241
21242         /* Paren and offset */
21243         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21244                 (int)((o + (int)ARG2L(o)) - progi->program) );
21245         if (name_list) {
21246             SV **name= av_fetch(name_list, ARG(o), 0 );
21247             if (name)
21248                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21249         }
21250     }
21251     else if (k == LOGICAL)
21252         /* 2: embedded, otherwise 1 */
21253         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21254     else if (k == ANYOF || k == ANYOFR) {
21255         U8 flags;
21256         char * bitmap;
21257         U32 arg;
21258         bool do_sep = FALSE;    /* Do we need to separate various components of
21259                                    the output? */
21260         /* Set if there is still an unresolved user-defined property */
21261         SV *unresolved                = NULL;
21262
21263         /* Things that are ignored except when the runtime locale is UTF-8 */
21264         SV *only_utf8_locale_invlist = NULL;
21265
21266         /* Code points that don't fit in the bitmap */
21267         SV *nonbitmap_invlist = NULL;
21268
21269         /* And things that aren't in the bitmap, but are small enough to be */
21270         SV* bitmap_range_not_in_bitmap = NULL;
21271
21272         bool inverted;
21273
21274         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21275             flags = 0;
21276             bitmap = NULL;
21277             arg = 0;
21278         }
21279         else {
21280             flags = ANYOF_FLAGS(o);
21281             bitmap = ANYOF_BITMAP(o);
21282             arg = ARG(o);
21283         }
21284
21285         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21286             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21287                 sv_catpvs(sv, "{utf8-locale-reqd}");
21288             }
21289             if (flags & ANYOFL_FOLD) {
21290                 sv_catpvs(sv, "{i}");
21291             }
21292         }
21293
21294         inverted = flags & ANYOF_INVERT;
21295
21296         /* If there is stuff outside the bitmap, get it */
21297         if (arg != ANYOF_ONLY_HAS_BITMAP) {
21298             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21299                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21300                                             ANYOFRbase(o),
21301                                             ANYOFRbase(o) + ANYOFRdelta(o));
21302             }
21303             else {
21304 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21305                 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21306                                                 &unresolved,
21307                                                 &only_utf8_locale_invlist,
21308                                                 &nonbitmap_invlist);
21309 #else
21310                 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21311                                                 &unresolved,
21312                                                 &only_utf8_locale_invlist,
21313                                                 &nonbitmap_invlist);
21314 #endif
21315             }
21316
21317             /* The non-bitmap data may contain stuff that could fit in the
21318              * bitmap.  This could come from a user-defined property being
21319              * finally resolved when this call was done; or much more likely
21320              * because there are matches that require UTF-8 to be valid, and so
21321              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
21322             _invlist_intersection(nonbitmap_invlist,
21323                                   PL_InBitmap,
21324                                   &bitmap_range_not_in_bitmap);
21325             /* Leave just the things that don't fit into the bitmap */
21326             _invlist_subtract(nonbitmap_invlist,
21327                               PL_InBitmap,
21328                               &nonbitmap_invlist);
21329         }
21330
21331         /* Obey this flag to add all above-the-bitmap code points */
21332         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21333             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21334                                                       NUM_ANYOF_CODE_POINTS,
21335                                                       UV_MAX);
21336         }
21337
21338         /* Ready to start outputting.  First, the initial left bracket */
21339         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21340
21341         /* ANYOFH by definition doesn't have anything that will fit inside the
21342          * bitmap;  ANYOFR may or may not. */
21343         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21344             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21345                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21346         {
21347             /* Then all the things that could fit in the bitmap */
21348             do_sep = put_charclass_bitmap_innards(sv,
21349                                                   bitmap,
21350                                                   bitmap_range_not_in_bitmap,
21351                                                   only_utf8_locale_invlist,
21352                                                   o,
21353                                                   flags,
21354
21355                                                   /* Can't try inverting for a
21356                                                    * better display if there
21357                                                    * are things that haven't
21358                                                    * been resolved */
21359                                                   unresolved != NULL
21360                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
21361             SvREFCNT_dec(bitmap_range_not_in_bitmap);
21362
21363             /* If there are user-defined properties which haven't been defined
21364              * yet, output them.  If the result is not to be inverted, it is
21365              * clearest to output them in a separate [] from the bitmap range
21366              * stuff.  If the result is to be complemented, we have to show
21367              * everything in one [], as the inversion applies to the whole
21368              * thing.  Use {braces} to separate them from anything in the
21369              * bitmap and anything above the bitmap. */
21370             if (unresolved) {
21371                 if (inverted) {
21372                     if (! do_sep) { /* If didn't output anything in the bitmap
21373                                      */
21374                         sv_catpvs(sv, "^");
21375                     }
21376                     sv_catpvs(sv, "{");
21377                 }
21378                 else if (do_sep) {
21379                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21380                                                       PL_colors[0]);
21381                 }
21382                 sv_catsv(sv, unresolved);
21383                 if (inverted) {
21384                     sv_catpvs(sv, "}");
21385                 }
21386                 do_sep = ! inverted;
21387             }
21388         }
21389
21390         /* And, finally, add the above-the-bitmap stuff */
21391         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21392             SV* contents;
21393
21394             /* See if truncation size is overridden */
21395             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21396                                     ? PL_dump_re_max_len
21397                                     : 256;
21398
21399             /* This is output in a separate [] */
21400             if (do_sep) {
21401                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21402             }
21403
21404             /* And, for easy of understanding, it is shown in the
21405              * uncomplemented form if possible.  The one exception being if
21406              * there are unresolved items, where the inversion has to be
21407              * delayed until runtime */
21408             if (inverted && ! unresolved) {
21409                 _invlist_invert(nonbitmap_invlist);
21410                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21411             }
21412
21413             contents = invlist_contents(nonbitmap_invlist,
21414                                         FALSE /* output suitable for catsv */
21415                                        );
21416
21417             /* If the output is shorter than the permissible maximum, just do it. */
21418             if (SvCUR(contents) <= dump_len) {
21419                 sv_catsv(sv, contents);
21420             }
21421             else {
21422                 const char * contents_string = SvPVX(contents);
21423                 STRLEN i = dump_len;
21424
21425                 /* Otherwise, start at the permissible max and work back to the
21426                  * first break possibility */
21427                 while (i > 0 && contents_string[i] != ' ') {
21428                     i--;
21429                 }
21430                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21431                                        find a legal break */
21432                     i = dump_len;
21433                 }
21434
21435                 sv_catpvn(sv, contents_string, i);
21436                 sv_catpvs(sv, "...");
21437             }
21438
21439             SvREFCNT_dec_NN(contents);
21440             SvREFCNT_dec_NN(nonbitmap_invlist);
21441         }
21442
21443         /* And finally the matching, closing ']' */
21444         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21445
21446         if (OP(o) == ANYOFHs) {
21447             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21448         }
21449         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21450             U8 lowest = (OP(o) != ANYOFHr)
21451                          ? FLAGS(o)
21452                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21453             U8 highest = (OP(o) == ANYOFHr)
21454                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21455                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21456                            ? 0xFF
21457                            : lowest;
21458 #ifndef EBCDIC
21459             if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21460 #endif
21461             {
21462                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21463                 if (lowest != highest) {
21464                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21465                 }
21466                 Perl_sv_catpvf(aTHX_ sv, ")");
21467             }
21468         }
21469
21470         SvREFCNT_dec(unresolved);
21471     }
21472     else if (k == ANYOFM) {
21473         SV * cp_list = get_ANYOFM_contents(o);
21474
21475         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21476         if (OP(o) == NANYOFM) {
21477             _invlist_invert(cp_list);
21478         }
21479
21480         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21481         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21482
21483         SvREFCNT_dec(cp_list);
21484     }
21485     else if (k == POSIXD || k == NPOSIXD) {
21486         U8 index = FLAGS(o) * 2;
21487         if (index < C_ARRAY_LENGTH(anyofs)) {
21488             if (*anyofs[index] != '[')  {
21489                 sv_catpvs(sv, "[");
21490             }
21491             sv_catpv(sv, anyofs[index]);
21492             if (*anyofs[index] != '[')  {
21493                 sv_catpvs(sv, "]");
21494             }
21495         }
21496         else {
21497             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21498         }
21499     }
21500     else if (k == BOUND || k == NBOUND) {
21501         /* Must be synced with order of 'bound_type' in regcomp.h */
21502         const char * const bounds[] = {
21503             "",      /* Traditional */
21504             "{gcb}",
21505             "{lb}",
21506             "{sb}",
21507             "{wb}"
21508         };
21509         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21510         sv_catpv(sv, bounds[FLAGS(o)]);
21511     }
21512     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21513         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21514         if (o->next_off) {
21515             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21516         }
21517         Perl_sv_catpvf(aTHX_ sv, "]");
21518     }
21519     else if (OP(o) == SBOL)
21520         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21521
21522     /* add on the verb argument if there is one */
21523     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21524         if ( ARG(o) )
21525             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21526                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21527         else
21528             sv_catpvs(sv, ":NULL");
21529     }
21530 #else
21531     PERL_UNUSED_CONTEXT;
21532     PERL_UNUSED_ARG(sv);
21533     PERL_UNUSED_ARG(o);
21534     PERL_UNUSED_ARG(prog);
21535     PERL_UNUSED_ARG(reginfo);
21536     PERL_UNUSED_ARG(pRExC_state);
21537 #endif  /* DEBUGGING */
21538 }
21539
21540
21541
21542 SV *
21543 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21544 {                               /* Assume that RE_INTUIT is set */
21545     /* Returns an SV containing a string that must appear in the target for it
21546      * to match, or NULL if nothing is known that must match.
21547      *
21548      * CAUTION: the SV can be freed during execution of the regex engine */
21549
21550     struct regexp *const prog = ReANY(r);
21551     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21552
21553     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21554     PERL_UNUSED_CONTEXT;
21555
21556     DEBUG_COMPILE_r(
21557         {
21558             if (prog->maxlen > 0) {
21559                 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21560                       ? prog->check_utf8 : prog->check_substr);
21561
21562                 if (!PL_colorset) reginitcolors();
21563                 Perl_re_printf( aTHX_
21564                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21565                       PL_colors[4],
21566                       RX_UTF8(r) ? "utf8 " : "",
21567                       PL_colors[5], PL_colors[0],
21568                       s,
21569                       PL_colors[1],
21570                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21571             }
21572         } );
21573
21574     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21575     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21576 }
21577
21578 /*
21579    pregfree()
21580
21581    handles refcounting and freeing the perl core regexp structure. When
21582    it is necessary to actually free the structure the first thing it
21583    does is call the 'free' method of the regexp_engine associated to
21584    the regexp, allowing the handling of the void *pprivate; member
21585    first. (This routine is not overridable by extensions, which is why
21586    the extensions free is called first.)
21587
21588    See regdupe and regdupe_internal if you change anything here.
21589 */
21590 #ifndef PERL_IN_XSUB_RE
21591 void
21592 Perl_pregfree(pTHX_ REGEXP *r)
21593 {
21594     SvREFCNT_dec(r);
21595 }
21596
21597 void
21598 Perl_pregfree2(pTHX_ REGEXP *rx)
21599 {
21600     struct regexp *const r = ReANY(rx);
21601     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21602
21603     PERL_ARGS_ASSERT_PREGFREE2;
21604
21605     if (! r)
21606         return;
21607
21608     if (r->mother_re) {
21609         ReREFCNT_dec(r->mother_re);
21610     } else {
21611         CALLREGFREE_PVT(rx); /* free the private data */
21612         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21613     }
21614     if (r->substrs) {
21615         int i;
21616         for (i = 0; i < 2; i++) {
21617             SvREFCNT_dec(r->substrs->data[i].substr);
21618             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21619         }
21620         Safefree(r->substrs);
21621     }
21622     RX_MATCH_COPY_FREE(rx);
21623 #ifdef PERL_ANY_COW
21624     SvREFCNT_dec(r->saved_copy);
21625 #endif
21626     Safefree(r->offs);
21627     SvREFCNT_dec(r->qr_anoncv);
21628     if (r->recurse_locinput)
21629         Safefree(r->recurse_locinput);
21630 }
21631
21632
21633 /*  reg_temp_copy()
21634
21635     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21636     except that dsv will be created if NULL.
21637
21638     This function is used in two main ways. First to implement
21639         $r = qr/....; $s = $$r;
21640
21641     Secondly, it is used as a hacky workaround to the structural issue of
21642     match results
21643     being stored in the regexp structure which is in turn stored in
21644     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21645     could be PL_curpm in multiple contexts, and could require multiple
21646     result sets being associated with the pattern simultaneously, such
21647     as when doing a recursive match with (??{$qr})
21648
21649     The solution is to make a lightweight copy of the regexp structure
21650     when a qr// is returned from the code executed by (??{$qr}) this
21651     lightweight copy doesn't actually own any of its data except for
21652     the starp/end and the actual regexp structure itself.
21653
21654 */
21655
21656
21657 REGEXP *
21658 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21659 {
21660     struct regexp *drx;
21661     struct regexp *const srx = ReANY(ssv);
21662     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21663
21664     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21665
21666     if (!dsv)
21667         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21668     else {
21669         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21670
21671         /* our only valid caller, sv_setsv_flags(), should have done
21672          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21673         assert(!SvOOK(dsv));
21674         assert(!SvIsCOW(dsv));
21675         assert(!SvROK(dsv));
21676
21677         if (SvPVX_const(dsv)) {
21678             if (SvLEN(dsv))
21679                 Safefree(SvPVX(dsv));
21680             SvPVX(dsv) = NULL;
21681         }
21682         SvLEN_set(dsv, 0);
21683         SvCUR_set(dsv, 0);
21684         SvOK_off((SV *)dsv);
21685
21686         if (islv) {
21687             /* For PVLVs, the head (sv_any) points to an XPVLV, while
21688              * the LV's xpvlenu_rx will point to a regexp body, which
21689              * we allocate here */
21690             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21691             assert(!SvPVX(dsv));
21692             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21693             temp->sv_any = NULL;
21694             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21695             SvREFCNT_dec_NN(temp);
21696             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21697                ing below will not set it. */
21698             SvCUR_set(dsv, SvCUR(ssv));
21699         }
21700     }
21701     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21702        sv_force_normal(sv) is called.  */
21703     SvFAKE_on(dsv);
21704     drx = ReANY(dsv);
21705
21706     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21707     SvPV_set(dsv, RX_WRAPPED(ssv));
21708     /* We share the same string buffer as the original regexp, on which we
21709        hold a reference count, incremented when mother_re is set below.
21710        The string pointer is copied here, being part of the regexp struct.
21711      */
21712     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21713            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21714     if (!islv)
21715         SvLEN_set(dsv, 0);
21716     if (srx->offs) {
21717         const I32 npar = srx->nparens+1;
21718         Newx(drx->offs, npar, regexp_paren_pair);
21719         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21720     }
21721     if (srx->substrs) {
21722         int i;
21723         Newx(drx->substrs, 1, struct reg_substr_data);
21724         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21725
21726         for (i = 0; i < 2; i++) {
21727             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21728             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21729         }
21730
21731         /* check_substr and check_utf8, if non-NULL, point to either their
21732            anchored or float namesakes, and don't hold a second reference.  */
21733     }
21734     RX_MATCH_COPIED_off(dsv);
21735 #ifdef PERL_ANY_COW
21736     drx->saved_copy = NULL;
21737 #endif
21738     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21739     SvREFCNT_inc_void(drx->qr_anoncv);
21740     if (srx->recurse_locinput)
21741         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21742
21743     return dsv;
21744 }
21745 #endif
21746
21747
21748 /* regfree_internal()
21749
21750    Free the private data in a regexp. This is overloadable by
21751    extensions. Perl takes care of the regexp structure in pregfree(),
21752    this covers the *pprivate pointer which technically perl doesn't
21753    know about, however of course we have to handle the
21754    regexp_internal structure when no extension is in use.
21755
21756    Note this is called before freeing anything in the regexp
21757    structure.
21758  */
21759
21760 void
21761 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21762 {
21763     struct regexp *const r = ReANY(rx);
21764     RXi_GET_DECL(r, ri);
21765     DECLARE_AND_GET_RE_DEBUG_FLAGS;
21766
21767     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21768
21769     if (! ri) {
21770         return;
21771     }
21772
21773     DEBUG_COMPILE_r({
21774         if (!PL_colorset)
21775             reginitcolors();
21776         {
21777             SV *dsv= sv_newmortal();
21778             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21779                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21780             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21781                 PL_colors[4], PL_colors[5], s);
21782         }
21783     });
21784
21785 #ifdef RE_TRACK_PATTERN_OFFSETS
21786     if (ri->u.offsets)
21787         Safefree(ri->u.offsets);             /* 20010421 MJD */
21788 #endif
21789     if (ri->code_blocks)
21790         S_free_codeblocks(aTHX_ ri->code_blocks);
21791
21792     if (ri->data) {
21793         int n = ri->data->count;
21794
21795         while (--n >= 0) {
21796           /* If you add a ->what type here, update the comment in regcomp.h */
21797             switch (ri->data->what[n]) {
21798             case 'a':
21799             case 'r':
21800             case 's':
21801             case 'S':
21802             case 'u':
21803                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21804                 break;
21805             case 'f':
21806                 Safefree(ri->data->data[n]);
21807                 break;
21808             case 'l':
21809             case 'L':
21810                 break;
21811             case 'T':
21812                 { /* Aho Corasick add-on structure for a trie node.
21813                      Used in stclass optimization only */
21814                     U32 refcount;
21815                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21816 #ifdef USE_ITHREADS
21817 #endif
21818                     OP_REFCNT_LOCK;
21819                     refcount = --aho->refcount;
21820                     OP_REFCNT_UNLOCK;
21821                     if ( !refcount ) {
21822                         PerlMemShared_free(aho->states);
21823                         PerlMemShared_free(aho->fail);
21824                          /* do this last!!!! */
21825                         PerlMemShared_free(ri->data->data[n]);
21826                         /* we should only ever get called once, so
21827                          * assert as much, and also guard the free
21828                          * which /might/ happen twice. At the least
21829                          * it will make code anlyzers happy and it
21830                          * doesn't cost much. - Yves */
21831                         assert(ri->regstclass);
21832                         if (ri->regstclass) {
21833                             PerlMemShared_free(ri->regstclass);
21834                             ri->regstclass = 0;
21835                         }
21836                     }
21837                 }
21838                 break;
21839             case 't':
21840                 {
21841                     /* trie structure. */
21842                     U32 refcount;
21843                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21844 #ifdef USE_ITHREADS
21845 #endif
21846                     OP_REFCNT_LOCK;
21847                     refcount = --trie->refcount;
21848                     OP_REFCNT_UNLOCK;
21849                     if ( !refcount ) {
21850                         PerlMemShared_free(trie->charmap);
21851                         PerlMemShared_free(trie->states);
21852                         PerlMemShared_free(trie->trans);
21853                         if (trie->bitmap)
21854                             PerlMemShared_free(trie->bitmap);
21855                         if (trie->jump)
21856                             PerlMemShared_free(trie->jump);
21857                         PerlMemShared_free(trie->wordinfo);
21858                         /* do this last!!!! */
21859                         PerlMemShared_free(ri->data->data[n]);
21860                     }
21861                 }
21862                 break;
21863             default:
21864                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21865                                                     ri->data->what[n]);
21866             }
21867         }
21868         Safefree(ri->data->what);
21869         Safefree(ri->data);
21870     }
21871
21872     Safefree(ri);
21873 }
21874
21875 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21876 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21877 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
21878
21879 /*
21880 =for apidoc re_dup_guts
21881 Duplicate a regexp.
21882
21883 This routine is expected to clone a given regexp structure. It is only
21884 compiled under USE_ITHREADS.
21885
21886 After all of the core data stored in struct regexp is duplicated
21887 the C<regexp_engine.dupe> method is used to copy any private data
21888 stored in the *pprivate pointer. This allows extensions to handle
21889 any duplication they need to do.
21890
21891 =cut
21892
21893    See pregfree() and regfree_internal() if you change anything here.
21894 */
21895 #if defined(USE_ITHREADS)
21896 #ifndef PERL_IN_XSUB_RE
21897 void
21898 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21899 {
21900     I32 npar;
21901     const struct regexp *r = ReANY(sstr);
21902     struct regexp *ret = ReANY(dstr);
21903
21904     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21905
21906     npar = r->nparens+1;
21907     Newx(ret->offs, npar, regexp_paren_pair);
21908     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21909
21910     if (ret->substrs) {
21911         /* Do it this way to avoid reading from *r after the StructCopy().
21912            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21913            cache, it doesn't matter.  */
21914         int i;
21915         const bool anchored = r->check_substr
21916             ? r->check_substr == r->substrs->data[0].substr
21917             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21918         Newx(ret->substrs, 1, struct reg_substr_data);
21919         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21920
21921         for (i = 0; i < 2; i++) {
21922             ret->substrs->data[i].substr =
21923                         sv_dup_inc(ret->substrs->data[i].substr, param);
21924             ret->substrs->data[i].utf8_substr =
21925                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21926         }
21927
21928         /* check_substr and check_utf8, if non-NULL, point to either their
21929            anchored or float namesakes, and don't hold a second reference.  */
21930
21931         if (ret->check_substr) {
21932             if (anchored) {
21933                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21934
21935                 ret->check_substr = ret->substrs->data[0].substr;
21936                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21937             } else {
21938                 assert(r->check_substr == r->substrs->data[1].substr);
21939                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21940
21941                 ret->check_substr = ret->substrs->data[1].substr;
21942                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21943             }
21944         } else if (ret->check_utf8) {
21945             if (anchored) {
21946                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21947             } else {
21948                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21949             }
21950         }
21951     }
21952
21953     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21954     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21955     if (r->recurse_locinput)
21956         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21957
21958     if (ret->pprivate)
21959         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21960
21961     if (RX_MATCH_COPIED(dstr))
21962         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21963     else
21964         ret->subbeg = NULL;
21965 #ifdef PERL_ANY_COW
21966     ret->saved_copy = NULL;
21967 #endif
21968
21969     /* Whether mother_re be set or no, we need to copy the string.  We
21970        cannot refrain from copying it when the storage points directly to
21971        our mother regexp, because that's
21972                1: a buffer in a different thread
21973                2: something we no longer hold a reference on
21974                so we need to copy it locally.  */
21975     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21976     /* set malloced length to a non-zero value so it will be freed
21977      * (otherwise in combination with SVf_FAKE it looks like an alien
21978      * buffer). It doesn't have to be the actual malloced size, since it
21979      * should never be grown */
21980     SvLEN_set(dstr, SvCUR(sstr)+1);
21981     ret->mother_re   = NULL;
21982 }
21983 #endif /* PERL_IN_XSUB_RE */
21984
21985 /*
21986    regdupe_internal()
21987
21988    This is the internal complement to regdupe() which is used to copy
21989    the structure pointed to by the *pprivate pointer in the regexp.
21990    This is the core version of the extension overridable cloning hook.
21991    The regexp structure being duplicated will be copied by perl prior
21992    to this and will be provided as the regexp *r argument, however
21993    with the /old/ structures pprivate pointer value. Thus this routine
21994    may override any copying normally done by perl.
21995
21996    It returns a pointer to the new regexp_internal structure.
21997 */
21998
21999 void *
22000 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
22001 {
22002     struct regexp *const r = ReANY(rx);
22003     regexp_internal *reti;
22004     int len;
22005     RXi_GET_DECL(r, ri);
22006
22007     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
22008
22009     len = ProgLen(ri);
22010
22011     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22012           char, regexp_internal);
22013     Copy(ri->program, reti->program, len+1, regnode);
22014
22015
22016     if (ri->code_blocks) {
22017         int n;
22018         Newx(reti->code_blocks, 1, struct reg_code_blocks);
22019         Newx(reti->code_blocks->cb, ri->code_blocks->count,
22020                     struct reg_code_block);
22021         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22022              ri->code_blocks->count, struct reg_code_block);
22023         for (n = 0; n < ri->code_blocks->count; n++)
22024              reti->code_blocks->cb[n].src_regex = (REGEXP*)
22025                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22026         reti->code_blocks->count = ri->code_blocks->count;
22027         reti->code_blocks->refcnt = 1;
22028     }
22029     else
22030         reti->code_blocks = NULL;
22031
22032     reti->regstclass = NULL;
22033
22034     if (ri->data) {
22035         struct reg_data *d;
22036         const int count = ri->data->count;
22037         int i;
22038
22039         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22040                 char, struct reg_data);
22041         Newx(d->what, count, U8);
22042
22043         d->count = count;
22044         for (i = 0; i < count; i++) {
22045             d->what[i] = ri->data->what[i];
22046             switch (d->what[i]) {
22047                 /* see also regcomp.h and regfree_internal() */
22048             case 'a': /* actually an AV, but the dup function is identical.
22049                          values seem to be "plain sv's" generally. */
22050             case 'r': /* a compiled regex (but still just another SV) */
22051             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22052                          this use case should go away, the code could have used
22053                          'a' instead - see S_set_ANYOF_arg() for array contents. */
22054             case 'S': /* actually an SV, but the dup function is identical.  */
22055             case 'u': /* actually an HV, but the dup function is identical.
22056                          values are "plain sv's" */
22057                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22058                 break;
22059             case 'f':
22060                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22061                  * patterns which could start with several different things. Pre-TRIE
22062                  * this was more important than it is now, however this still helps
22063                  * in some places, for instance /x?a+/ might produce a SSC equivalent
22064                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22065                  * in regexec.c
22066                  */
22067                 /* This is cheating. */
22068                 Newx(d->data[i], 1, regnode_ssc);
22069                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22070                 reti->regstclass = (regnode*)d->data[i];
22071                 break;
22072             case 'T':
22073                 /* AHO-CORASICK fail table */
22074                 /* Trie stclasses are readonly and can thus be shared
22075                  * without duplication. We free the stclass in pregfree
22076                  * when the corresponding reg_ac_data struct is freed.
22077                  */
22078                 reti->regstclass= ri->regstclass;
22079                 /* FALLTHROUGH */
22080             case 't':
22081                 /* TRIE transition table */
22082                 OP_REFCNT_LOCK;
22083                 ((reg_trie_data*)ri->data->data[i])->refcount++;
22084                 OP_REFCNT_UNLOCK;
22085                 /* FALLTHROUGH */
22086             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22087             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22088                          is not from another regexp */
22089                 d->data[i] = ri->data->data[i];
22090                 break;
22091             default:
22092                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22093                                                            ri->data->what[i]);
22094             }
22095         }
22096
22097         reti->data = d;
22098     }
22099     else
22100         reti->data = NULL;
22101
22102     reti->name_list_idx = ri->name_list_idx;
22103
22104 #ifdef RE_TRACK_PATTERN_OFFSETS
22105     if (ri->u.offsets) {
22106         Newx(reti->u.offsets, 2*len+1, U32);
22107         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22108     }
22109 #else
22110     SetProgLen(reti, len);
22111 #endif
22112
22113     return (void*)reti;
22114 }
22115
22116 #endif    /* USE_ITHREADS */
22117
22118 #ifndef PERL_IN_XSUB_RE
22119
22120 /*
22121  - regnext - dig the "next" pointer out of a node
22122  */
22123 regnode *
22124 Perl_regnext(pTHX_ regnode *p)
22125 {
22126     I32 offset;
22127
22128     if (!p)
22129         return(NULL);
22130
22131     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
22132         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22133                                                 (int)OP(p), (int)REGNODE_MAX);
22134     }
22135
22136     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22137     if (offset == 0)
22138         return(NULL);
22139
22140     return(p+offset);
22141 }
22142
22143 #endif
22144
22145 STATIC void
22146 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22147 {
22148     va_list args;
22149     STRLEN len = strlen(pat);
22150     char buf[512];
22151     SV *msv;
22152     const char *message;
22153
22154     PERL_ARGS_ASSERT_RE_CROAK;
22155
22156     if (len > 510)
22157         len = 510;
22158     Copy(pat, buf, len , char);
22159     buf[len] = '\n';
22160     buf[len + 1] = '\0';
22161     va_start(args, pat);
22162     msv = vmess(buf, &args);
22163     va_end(args);
22164     message = SvPV_const(msv, len);
22165     if (len > 512)
22166         len = 512;
22167     Copy(message, buf, len , char);
22168     /* len-1 to avoid \n */
22169     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22170 }
22171
22172 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
22173
22174 #ifndef PERL_IN_XSUB_RE
22175 void
22176 Perl_save_re_context(pTHX)
22177 {
22178     I32 nparens = -1;
22179     I32 i;
22180
22181     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22182
22183     if (PL_curpm) {
22184         const REGEXP * const rx = PM_GETRE(PL_curpm);
22185         if (rx)
22186             nparens = RX_NPARENS(rx);
22187     }
22188
22189     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22190      * that PL_curpm will be null, but that utf8.pm and the modules it
22191      * loads will only use $1..$3.
22192      * The t/porting/re_context.t test file checks this assumption.
22193      */
22194     if (nparens == -1)
22195         nparens = 3;
22196
22197     for (i = 1; i <= nparens; i++) {
22198         char digits[TYPE_CHARS(long)];
22199         const STRLEN len = my_snprintf(digits, sizeof(digits),
22200                                        "%lu", (long)i);
22201         GV *const *const gvp
22202             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22203
22204         if (gvp) {
22205             GV * const gv = *gvp;
22206             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22207                 save_scalar(gv);
22208         }
22209     }
22210 }
22211 #endif
22212
22213 #ifdef DEBUGGING
22214
22215 STATIC void
22216 S_put_code_point(pTHX_ SV *sv, UV c)
22217 {
22218     PERL_ARGS_ASSERT_PUT_CODE_POINT;
22219
22220     if (c > 255) {
22221         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22222     }
22223     else if (isPRINT(c)) {
22224         const char string = (char) c;
22225
22226         /* We use {phrase} as metanotation in the class, so also escape literal
22227          * braces */
22228         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22229             sv_catpvs(sv, "\\");
22230         sv_catpvn(sv, &string, 1);
22231     }
22232     else if (isMNEMONIC_CNTRL(c)) {
22233         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22234     }
22235     else {
22236         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22237     }
22238 }
22239
22240 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22241
22242 STATIC void
22243 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22244 {
22245     /* Appends to 'sv' a displayable version of the range of code points from
22246      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
22247      * that have them, when they occur at the beginning or end of the range.
22248      * It uses hex to output the remaining code points, unless 'allow_literals'
22249      * is true, in which case the printable ASCII ones are output as-is (though
22250      * some of these will be escaped by put_code_point()).
22251      *
22252      * NOTE:  This is designed only for printing ranges of code points that fit
22253      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
22254      */
22255
22256     const unsigned int min_range_count = 3;
22257
22258     assert(start <= end);
22259
22260     PERL_ARGS_ASSERT_PUT_RANGE;
22261
22262     while (start <= end) {
22263         UV this_end;
22264         const char * format;
22265
22266         if (    end - start < min_range_count
22267             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22268         {
22269             /* Output a range of 1 or 2 chars individually, or longer ranges
22270              * when printable */
22271             for (; start <= end; start++) {
22272                 put_code_point(sv, start);
22273             }
22274             break;
22275         }
22276
22277         /* If permitted by the input options, and there is a possibility that
22278          * this range contains a printable literal, look to see if there is
22279          * one. */
22280         if (allow_literals && start <= MAX_PRINT_A) {
22281
22282             /* If the character at the beginning of the range isn't an ASCII
22283              * printable, effectively split the range into two parts:
22284              *  1) the portion before the first such printable,
22285              *  2) the rest
22286              * and output them separately. */
22287             if (! isPRINT_A(start)) {
22288                 UV temp_end = start + 1;
22289
22290                 /* There is no point looking beyond the final possible
22291                  * printable, in MAX_PRINT_A */
22292                 UV max = MIN(end, MAX_PRINT_A);
22293
22294                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22295                     temp_end++;
22296                 }
22297
22298                 /* Here, temp_end points to one beyond the first printable if
22299                  * found, or to one beyond 'max' if not.  If none found, make
22300                  * sure that we use the entire range */
22301                 if (temp_end > MAX_PRINT_A) {
22302                     temp_end = end + 1;
22303                 }
22304
22305                 /* Output the first part of the split range: the part that
22306                  * doesn't have printables, with the parameter set to not look
22307                  * for literals (otherwise we would infinitely recurse) */
22308                 put_range(sv, start, temp_end - 1, FALSE);
22309
22310                 /* The 2nd part of the range (if any) starts here. */
22311                 start = temp_end;
22312
22313                 /* We do a continue, instead of dropping down, because even if
22314                  * the 2nd part is non-empty, it could be so short that we want
22315                  * to output it as individual characters, as tested for at the
22316                  * top of this loop.  */
22317                 continue;
22318             }
22319
22320             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
22321              * output a sub-range of just the digits or letters, then process
22322              * the remaining portion as usual. */
22323             if (isALPHANUMERIC_A(start)) {
22324                 UV mask = (isDIGIT_A(start))
22325                            ? _CC_DIGIT
22326                              : isUPPER_A(start)
22327                                ? _CC_UPPER
22328                                : _CC_LOWER;
22329                 UV temp_end = start + 1;
22330
22331                 /* Find the end of the sub-range that includes just the
22332                  * characters in the same class as the first character in it */
22333                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22334                     temp_end++;
22335                 }
22336                 temp_end--;
22337
22338                 /* For short ranges, don't duplicate the code above to output
22339                  * them; just call recursively */
22340                 if (temp_end - start < min_range_count) {
22341                     put_range(sv, start, temp_end, FALSE);
22342                 }
22343                 else {  /* Output as a range */
22344                     put_code_point(sv, start);
22345                     sv_catpvs(sv, "-");
22346                     put_code_point(sv, temp_end);
22347                 }
22348                 start = temp_end + 1;
22349                 continue;
22350             }
22351
22352             /* We output any other printables as individual characters */
22353             if (isPUNCT_A(start) || isSPACE_A(start)) {
22354                 while (start <= end && (isPUNCT_A(start)
22355                                         || isSPACE_A(start)))
22356                 {
22357                     put_code_point(sv, start);
22358                     start++;
22359                 }
22360                 continue;
22361             }
22362         } /* End of looking for literals */
22363
22364         /* Here is not to output as a literal.  Some control characters have
22365          * mnemonic names.  Split off any of those at the beginning and end of
22366          * the range to print mnemonically.  It isn't possible for many of
22367          * these to be in a row, so this won't overwhelm with output */
22368         if (   start <= end
22369             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22370         {
22371             while (isMNEMONIC_CNTRL(start) && start <= end) {
22372                 put_code_point(sv, start);
22373                 start++;
22374             }
22375
22376             /* If this didn't take care of the whole range ... */
22377             if (start <= end) {
22378
22379                 /* Look backwards from the end to find the final non-mnemonic
22380                  * */
22381                 UV temp_end = end;
22382                 while (isMNEMONIC_CNTRL(temp_end)) {
22383                     temp_end--;
22384                 }
22385
22386                 /* And separately output the interior range that doesn't start
22387                  * or end with mnemonics */
22388                 put_range(sv, start, temp_end, FALSE);
22389
22390                 /* Then output the mnemonic trailing controls */
22391                 start = temp_end + 1;
22392                 while (start <= end) {
22393                     put_code_point(sv, start);
22394                     start++;
22395                 }
22396                 break;
22397             }
22398         }
22399
22400         /* As a final resort, output the range or subrange as hex. */
22401
22402         if (start >= NUM_ANYOF_CODE_POINTS) {
22403             this_end = end;
22404         }
22405         else {  /* Have to split range at the bitmap boundary */
22406             this_end = (end < NUM_ANYOF_CODE_POINTS)
22407                         ? end
22408                         : NUM_ANYOF_CODE_POINTS - 1;
22409         }
22410 #if NUM_ANYOF_CODE_POINTS > 256
22411         format = (this_end < 256)
22412                  ? "\\x%02" UVXf "-\\x%02" UVXf
22413                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22414 #else
22415         format = "\\x%02" UVXf "-\\x%02" UVXf;
22416 #endif
22417         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22418         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22419         GCC_DIAG_RESTORE_STMT;
22420         break;
22421     }
22422 }
22423
22424 STATIC void
22425 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22426 {
22427     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22428      * 'invlist' */
22429
22430     UV start, end;
22431     bool allow_literals = TRUE;
22432
22433     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22434
22435     /* Generally, it is more readable if printable characters are output as
22436      * literals, but if a range (nearly) spans all of them, it's best to output
22437      * it as a single range.  This code will use a single range if all but 2
22438      * ASCII printables are in it */
22439     invlist_iterinit(invlist);
22440     while (invlist_iternext(invlist, &start, &end)) {
22441
22442         /* If the range starts beyond the final printable, it doesn't have any
22443          * in it */
22444         if (start > MAX_PRINT_A) {
22445             break;
22446         }
22447
22448         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22449          * all but two, the range must start and end no later than 2 from
22450          * either end */
22451         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22452             if (end > MAX_PRINT_A) {
22453                 end = MAX_PRINT_A;
22454             }
22455             if (start < ' ') {
22456                 start = ' ';
22457             }
22458             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22459                 allow_literals = FALSE;
22460             }
22461             break;
22462         }
22463     }
22464     invlist_iterfinish(invlist);
22465
22466     /* Here we have figured things out.  Output each range */
22467     invlist_iterinit(invlist);
22468     while (invlist_iternext(invlist, &start, &end)) {
22469         if (start >= NUM_ANYOF_CODE_POINTS) {
22470             break;
22471         }
22472         put_range(sv, start, end, allow_literals);
22473     }
22474     invlist_iterfinish(invlist);
22475
22476     return;
22477 }
22478
22479 STATIC SV*
22480 S_put_charclass_bitmap_innards_common(pTHX_
22481         SV* invlist,            /* The bitmap */
22482         SV* posixes,            /* Under /l, things like [:word:], \S */
22483         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22484         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22485         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22486         const bool invert       /* Is the result to be inverted? */
22487 )
22488 {
22489     /* Create and return an SV containing a displayable version of the bitmap
22490      * and associated information determined by the input parameters.  If the
22491      * output would have been only the inversion indicator '^', NULL is instead
22492      * returned. */
22493
22494     SV * output;
22495
22496     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22497
22498     if (invert) {
22499         output = newSVpvs("^");
22500     }
22501     else {
22502         output = newSVpvs("");
22503     }
22504
22505     /* First, the code points in the bitmap that are unconditionally there */
22506     put_charclass_bitmap_innards_invlist(output, invlist);
22507
22508     /* Traditionally, these have been placed after the main code points */
22509     if (posixes) {
22510         sv_catsv(output, posixes);
22511     }
22512
22513     if (only_utf8 && _invlist_len(only_utf8)) {
22514         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22515         put_charclass_bitmap_innards_invlist(output, only_utf8);
22516     }
22517
22518     if (not_utf8 && _invlist_len(not_utf8)) {
22519         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22520         put_charclass_bitmap_innards_invlist(output, not_utf8);
22521     }
22522
22523     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22524         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22525         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22526
22527         /* This is the only list in this routine that can legally contain code
22528          * points outside the bitmap range.  The call just above to
22529          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22530          * output them here.  There's about a half-dozen possible, and none in
22531          * contiguous ranges longer than 2 */
22532         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22533             UV start, end;
22534             SV* above_bitmap = NULL;
22535
22536             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22537
22538             invlist_iterinit(above_bitmap);
22539             while (invlist_iternext(above_bitmap, &start, &end)) {
22540                 UV i;
22541
22542                 for (i = start; i <= end; i++) {
22543                     put_code_point(output, i);
22544                 }
22545             }
22546             invlist_iterfinish(above_bitmap);
22547             SvREFCNT_dec_NN(above_bitmap);
22548         }
22549     }
22550
22551     if (invert && SvCUR(output) == 1) {
22552         return NULL;
22553     }
22554
22555     return output;
22556 }
22557
22558 STATIC bool
22559 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22560                                      char *bitmap,
22561                                      SV *nonbitmap_invlist,
22562                                      SV *only_utf8_locale_invlist,
22563                                      const regnode * const node,
22564                                      const U8 flags,
22565                                      const bool force_as_is_display)
22566 {
22567     /* Appends to 'sv' a displayable version of the innards of the bracketed
22568      * character class defined by the other arguments:
22569      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22570      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22571      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22572      *      none.  The reasons for this could be that they require some
22573      *      condition such as the target string being or not being in UTF-8
22574      *      (under /d), or because they came from a user-defined property that
22575      *      was not resolved at the time of the regex compilation (under /u)
22576      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22577      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22578      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22579      *      above two parameters are not null, and is passed so that this
22580      *      routine can tease apart the various reasons for them.
22581      *  'flags' is the flags field of 'node'
22582      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22583      *      to invert things to see if that leads to a cleaner display.  If
22584      *      FALSE, this routine is free to use its judgment about doing this.
22585      *
22586      * It returns TRUE if there was actually something output.  (It may be that
22587      * the bitmap, etc is empty.)
22588      *
22589      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22590      * bitmap, with the succeeding parameters set to NULL, and the final one to
22591      * FALSE.
22592      */
22593
22594     /* In general, it tries to display the 'cleanest' representation of the
22595      * innards, choosing whether to display them inverted or not, regardless of
22596      * whether the class itself is to be inverted.  However,  there are some
22597      * cases where it can't try inverting, as what actually matches isn't known
22598      * until runtime, and hence the inversion isn't either. */
22599
22600     bool inverting_allowed = ! force_as_is_display;
22601
22602     int i;
22603     STRLEN orig_sv_cur = SvCUR(sv);
22604
22605     SV* invlist;            /* Inversion list we accumulate of code points that
22606                                are unconditionally matched */
22607     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22608                                UTF-8 */
22609     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22610                              */
22611     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22612     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22613                                        is UTF-8 */
22614
22615     SV* as_is_display;      /* The output string when we take the inputs
22616                                literally */
22617     SV* inverted_display;   /* The output string when we invert the inputs */
22618
22619     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22620                                                    to match? */
22621     /* We are biased in favor of displaying things without them being inverted,
22622      * as that is generally easier to understand */
22623     const int bias = 5;
22624
22625     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22626
22627     /* Start off with whatever code points are passed in.  (We clone, so we
22628      * don't change the caller's list) */
22629     if (nonbitmap_invlist) {
22630         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22631         invlist = invlist_clone(nonbitmap_invlist, NULL);
22632     }
22633     else {  /* Worst case size is every other code point is matched */
22634         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22635     }
22636
22637     if (flags) {
22638         if (OP(node) == ANYOFD) {
22639
22640             /* This flag indicates that the code points below 0x100 in the
22641              * nonbitmap list are precisely the ones that match only when the
22642              * target is UTF-8 (they should all be non-ASCII). */
22643             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22644             {
22645                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22646                 _invlist_subtract(invlist, only_utf8, &invlist);
22647             }
22648
22649             /* And this flag for matching all non-ASCII 0xFF and below */
22650             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22651             {
22652                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22653             }
22654         }
22655         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22656
22657             /* If either of these flags are set, what matches isn't
22658              * determinable except during execution, so don't know enough here
22659              * to invert */
22660             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22661                 inverting_allowed = FALSE;
22662             }
22663
22664             /* What the posix classes match also varies at runtime, so these
22665              * will be output symbolically. */
22666             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22667                 int i;
22668
22669                 posixes = newSVpvs("");
22670                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22671                     if (ANYOF_POSIXL_TEST(node, i)) {
22672                         sv_catpv(posixes, anyofs[i]);
22673                     }
22674                 }
22675             }
22676         }
22677     }
22678
22679     /* Accumulate the bit map into the unconditional match list */
22680     if (bitmap) {
22681         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22682             if (BITMAP_TEST(bitmap, i)) {
22683                 int start = i++;
22684                 for (;
22685                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22686                      i++)
22687                 { /* empty */ }
22688                 invlist = _add_range_to_invlist(invlist, start, i-1);
22689             }
22690         }
22691     }
22692
22693     /* Make sure that the conditional match lists don't have anything in them
22694      * that match unconditionally; otherwise the output is quite confusing.
22695      * This could happen if the code that populates these misses some
22696      * duplication. */
22697     if (only_utf8) {
22698         _invlist_subtract(only_utf8, invlist, &only_utf8);
22699     }
22700     if (not_utf8) {
22701         _invlist_subtract(not_utf8, invlist, &not_utf8);
22702     }
22703
22704     if (only_utf8_locale_invlist) {
22705
22706         /* Since this list is passed in, we have to make a copy before
22707          * modifying it */
22708         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22709
22710         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22711
22712         /* And, it can get really weird for us to try outputting an inverted
22713          * form of this list when it has things above the bitmap, so don't even
22714          * try */
22715         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22716             inverting_allowed = FALSE;
22717         }
22718     }
22719
22720     /* Calculate what the output would be if we take the input as-is */
22721     as_is_display = put_charclass_bitmap_innards_common(invlist,
22722                                                     posixes,
22723                                                     only_utf8,
22724                                                     not_utf8,
22725                                                     only_utf8_locale,
22726                                                     invert);
22727
22728     /* If have to take the output as-is, just do that */
22729     if (! inverting_allowed) {
22730         if (as_is_display) {
22731             sv_catsv(sv, as_is_display);
22732             SvREFCNT_dec_NN(as_is_display);
22733         }
22734     }
22735     else { /* But otherwise, create the output again on the inverted input, and
22736               use whichever version is shorter */
22737
22738         int inverted_bias, as_is_bias;
22739
22740         /* We will apply our bias to whichever of the results doesn't have
22741          * the '^' */
22742         if (invert) {
22743             invert = FALSE;
22744             as_is_bias = bias;
22745             inverted_bias = 0;
22746         }
22747         else {
22748             invert = TRUE;
22749             as_is_bias = 0;
22750             inverted_bias = bias;
22751         }
22752
22753         /* Now invert each of the lists that contribute to the output,
22754          * excluding from the result things outside the possible range */
22755
22756         /* For the unconditional inversion list, we have to add in all the
22757          * conditional code points, so that when inverted, they will be gone
22758          * from it */
22759         _invlist_union(only_utf8, invlist, &invlist);
22760         _invlist_union(not_utf8, invlist, &invlist);
22761         _invlist_union(only_utf8_locale, invlist, &invlist);
22762         _invlist_invert(invlist);
22763         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22764
22765         if (only_utf8) {
22766             _invlist_invert(only_utf8);
22767             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22768         }
22769         else if (not_utf8) {
22770
22771             /* If a code point matches iff the target string is not in UTF-8,
22772              * then complementing the result has it not match iff not in UTF-8,
22773              * which is the same thing as matching iff it is UTF-8. */
22774             only_utf8 = not_utf8;
22775             not_utf8 = NULL;
22776         }
22777
22778         if (only_utf8_locale) {
22779             _invlist_invert(only_utf8_locale);
22780             _invlist_intersection(only_utf8_locale,
22781                                   PL_InBitmap,
22782                                   &only_utf8_locale);
22783         }
22784
22785         inverted_display = put_charclass_bitmap_innards_common(
22786                                             invlist,
22787                                             posixes,
22788                                             only_utf8,
22789                                             not_utf8,
22790                                             only_utf8_locale, invert);
22791
22792         /* Use the shortest representation, taking into account our bias
22793          * against showing it inverted */
22794         if (   inverted_display
22795             && (   ! as_is_display
22796                 || (  SvCUR(inverted_display) + inverted_bias
22797                     < SvCUR(as_is_display)    + as_is_bias)))
22798         {
22799             sv_catsv(sv, inverted_display);
22800         }
22801         else if (as_is_display) {
22802             sv_catsv(sv, as_is_display);
22803         }
22804
22805         SvREFCNT_dec(as_is_display);
22806         SvREFCNT_dec(inverted_display);
22807     }
22808
22809     SvREFCNT_dec_NN(invlist);
22810     SvREFCNT_dec(only_utf8);
22811     SvREFCNT_dec(not_utf8);
22812     SvREFCNT_dec(posixes);
22813     SvREFCNT_dec(only_utf8_locale);
22814
22815     return SvCUR(sv) > orig_sv_cur;
22816 }
22817
22818 #define CLEAR_OPTSTART                                                       \
22819     if (optstart) STMT_START {                                               \
22820         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22821                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22822         optstart=NULL;                                                       \
22823     } STMT_END
22824
22825 #define DUMPUNTIL(b,e)                                                       \
22826                     CLEAR_OPTSTART;                                          \
22827                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22828
22829 STATIC const regnode *
22830 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22831             const regnode *last, const regnode *plast,
22832             SV* sv, I32 indent, U32 depth)
22833 {
22834     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22835     const regnode *next;
22836     const regnode *optstart= NULL;
22837
22838     RXi_GET_DECL(r, ri);
22839     DECLARE_AND_GET_RE_DEBUG_FLAGS;
22840
22841     PERL_ARGS_ASSERT_DUMPUNTIL;
22842
22843 #ifdef DEBUG_DUMPUNTIL
22844     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22845         last ? last-start : 0, plast ? plast-start : 0);
22846 #endif
22847
22848     if (plast && plast < last)
22849         last= plast;
22850
22851     while (PL_regkind[op] != END && (!last || node < last)) {
22852         assert(node);
22853         /* While that wasn't END last time... */
22854         NODE_ALIGN(node);
22855         op = OP(node);
22856         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22857             indent--;
22858         next = regnext((regnode *)node);
22859
22860         /* Where, what. */
22861         if (OP(node) == OPTIMIZED) {
22862             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22863                 optstart = node;
22864             else
22865                 goto after_print;
22866         } else
22867             CLEAR_OPTSTART;
22868
22869         regprop(r, sv, node, NULL, NULL);
22870         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22871                       (int)(2*indent + 1), "", SvPVX_const(sv));
22872
22873         if (OP(node) != OPTIMIZED) {
22874             if (next == NULL)           /* Next ptr. */
22875                 Perl_re_printf( aTHX_  " (0)");
22876             else if (PL_regkind[(U8)op] == BRANCH
22877                      && PL_regkind[OP(next)] != BRANCH )
22878                 Perl_re_printf( aTHX_  " (FAIL)");
22879             else
22880                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22881             Perl_re_printf( aTHX_ "\n");
22882         }
22883
22884       after_print:
22885         if (PL_regkind[(U8)op] == BRANCHJ) {
22886             assert(next);
22887             {
22888                 const regnode *nnode = (OP(next) == LONGJMP
22889                                        ? regnext((regnode *)next)
22890                                        : next);
22891                 if (last && nnode > last)
22892                     nnode = last;
22893                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22894             }
22895         }
22896         else if (PL_regkind[(U8)op] == BRANCH) {
22897             assert(next);
22898             DUMPUNTIL(NEXTOPER(node), next);
22899         }
22900         else if ( PL_regkind[(U8)op]  == TRIE ) {
22901             const regnode *this_trie = node;
22902             const char op = OP(node);
22903             const U32 n = ARG(node);
22904             const reg_ac_data * const ac = op>=AHOCORASICK ?
22905                (reg_ac_data *)ri->data->data[n] :
22906                NULL;
22907             const reg_trie_data * const trie =
22908                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22909 #ifdef DEBUGGING
22910             AV *const trie_words
22911                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22912 #endif
22913             const regnode *nextbranch= NULL;
22914             I32 word_idx;
22915             SvPVCLEAR(sv);
22916             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22917                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22918
22919                 Perl_re_indentf( aTHX_  "%s ",
22920                     indent+3,
22921                     elem_ptr
22922                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22923                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22924                                 PL_colors[0], PL_colors[1],
22925                                 (SvUTF8(*elem_ptr)
22926                                  ? PERL_PV_ESCAPE_UNI
22927                                  : 0)
22928                                 | PERL_PV_PRETTY_ELLIPSES
22929                                 | PERL_PV_PRETTY_LTGT
22930                             )
22931                     : "???"
22932                 );
22933                 if (trie->jump) {
22934                     U16 dist= trie->jump[word_idx+1];
22935                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22936                                (UV)((dist ? this_trie + dist : next) - start));
22937                     if (dist) {
22938                         if (!nextbranch)
22939                             nextbranch= this_trie + trie->jump[0];
22940                         DUMPUNTIL(this_trie + dist, nextbranch);
22941                     }
22942                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22943                         nextbranch= regnext((regnode *)nextbranch);
22944                 } else {
22945                     Perl_re_printf( aTHX_  "\n");
22946                 }
22947             }
22948             if (last && next > last)
22949                 node= last;
22950             else
22951                 node= next;
22952         }
22953         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22954             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22955                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22956         }
22957         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22958             assert(next);
22959             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22960         }
22961         else if ( op == PLUS || op == STAR) {
22962             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22963         }
22964         else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22965             /* Literal string, where present. */
22966             node += NODE_SZ_STR(node) - 1;
22967             node = NEXTOPER(node);
22968         }
22969         else {
22970             node = NEXTOPER(node);
22971             node += regarglen[(U8)op];
22972         }
22973         if (op == CURLYX || op == OPEN || op == SROPEN)
22974             indent++;
22975     }
22976     CLEAR_OPTSTART;
22977 #ifdef DEBUG_DUMPUNTIL
22978     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22979 #endif
22980     return node;
22981 }
22982
22983 #endif  /* DEBUGGING */
22984
22985 #ifndef PERL_IN_XSUB_RE
22986
22987 #  include "uni_keywords.h"
22988
22989 void
22990 Perl_init_uniprops(pTHX)
22991 {
22992
22993 #  ifdef DEBUGGING
22994     char * dump_len_string;
22995
22996     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22997     if (   ! dump_len_string
22998         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22999     {
23000         PL_dump_re_max_len = 60;    /* A reasonable default */
23001     }
23002 #  endif
23003
23004     PL_user_def_props = newHV();
23005
23006 #  ifdef USE_ITHREADS
23007
23008     HvSHAREKEYS_off(PL_user_def_props);
23009     PL_user_def_props_aTHX = aTHX;
23010
23011 #  endif
23012
23013     /* Set up the inversion list interpreter-level variables */
23014
23015     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23016     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23017     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23018     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23019     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23020     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23021     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23022     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23023     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23024     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23025     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23026     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23027     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23028     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23029     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23030     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23031
23032     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23033     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23034     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23035     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23036     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23037     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23038     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23039     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23040     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23041     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23042     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23043     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23044     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23045     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23046     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23047     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23048
23049     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23050     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23051     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23052     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23053     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23054
23055     PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23056     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23057     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23058     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23059
23060     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23061
23062     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23063     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23064
23065     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23066     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23067
23068     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23069     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23070                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23071     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23072                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23073     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23074     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23075     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23076     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23077     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23078     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23079     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23080     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23081     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23082
23083 #  ifdef UNI_XIDC
23084     /* The below are used only by deprecated functions.  They could be removed */
23085     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23086     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23087     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23088 #  endif
23089 }
23090
23091 /* These four functions are compiled only in regcomp.c, where they have access
23092  * to the data they return.  They are a way for re_comp.c to get access to that
23093  * data without having to compile the whole data structures. */
23094
23095 I16
23096 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23097 {
23098     PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23099
23100     return match_uniprop((U8 *) key, key_len);
23101 }
23102
23103 SV *
23104 Perl_get_prop_definition(pTHX_ const int table_index)
23105 {
23106     PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23107
23108     /* Create and return the inversion list */
23109     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23110 }
23111
23112 const char * const *
23113 Perl_get_prop_values(const int table_index)
23114 {
23115     PERL_ARGS_ASSERT_GET_PROP_VALUES;
23116
23117     return UNI_prop_value_ptrs[table_index];
23118 }
23119
23120 const char *
23121 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23122 {
23123     PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23124
23125     return deprecated_property_msgs[warning_offset];
23126 }
23127
23128 #  if 0
23129
23130 This code was mainly added for backcompat to give a warning for non-portable
23131 code points in user-defined properties.  But experiments showed that the
23132 warning in earlier perls were only omitted on overflow, which should be an
23133 error, so there really isnt a backcompat issue, and actually adding the
23134 warning when none was present before might cause breakage, for little gain.  So
23135 khw left this code in, but not enabled.  Tests were never added.
23136
23137 embed.fnc entry:
23138 Ei      |const char *|get_extended_utf8_msg|const UV cp
23139
23140 PERL_STATIC_INLINE const char *
23141 S_get_extended_utf8_msg(pTHX_ const UV cp)
23142 {
23143     U8 dummy[UTF8_MAXBYTES + 1];
23144     HV *msgs;
23145     SV **msg;
23146
23147     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23148                              &msgs);
23149
23150     msg = hv_fetchs(msgs, "text", 0);
23151     assert(msg);
23152
23153     (void) sv_2mortal((SV *) msgs);
23154
23155     return SvPVX(*msg);
23156 }
23157
23158 #  endif
23159 #endif /* end of ! PERL_IN_XSUB_RE */
23160
23161 STATIC REGEXP *
23162 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23163                          const bool ignore_case)
23164 {
23165     /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23166      * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
23167      * because nothing outside of ASCII will match.  Use /m because the input
23168      * string may be a bunch of lines strung together.
23169      *
23170      * Also sets up the debugging info */
23171
23172     U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23173     U32 rx_flags;
23174     SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23175     REGEXP * subpattern_re;
23176     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23177
23178     PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23179
23180     if (ignore_case) {
23181         flags |= PMf_FOLD;
23182     }
23183     set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23184
23185     /* Like in op.c, we copy the compile time pm flags to the rx ones */
23186     rx_flags = flags & RXf_PMf_COMPILETIME;
23187
23188 #ifndef PERL_IN_XSUB_RE
23189     /* Use the core engine if this file is regcomp.c.  That means no
23190      * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23191     subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23192                                              &PL_core_reg_engine,
23193                                              NULL, NULL,
23194                                              rx_flags, flags);
23195 #else
23196     if (isDEBUG_WILDCARD) {
23197         /* Use the special debugging engine if this file is re_comp.c and wants
23198          * to output the wildcard matching.  This uses whatever
23199          * 'use re "Debug ..." is in effect */
23200         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23201                                                  &my_reg_engine,
23202                                                  NULL, NULL,
23203                                                  rx_flags, flags);
23204     }
23205     else {
23206         /* Use the special wildcard engine if this file is re_comp.c and
23207          * doesn't want to output the wildcard matching.  This uses whatever
23208          * 'use re "Debug ..." is in effect for compilation, but this engine
23209          * structure has been set up so that it uses the core engine for
23210          * execution, so no execution debugging as a result of re.pm will be
23211          * displayed. */
23212         subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23213                                                  &wild_reg_engine,
23214                                                  NULL, NULL,
23215                                                  rx_flags, flags);
23216         /* XXX The above has the effect that any user-supplied regex engine
23217          * won't be called for matching wildcards.  That might be good, or bad.
23218          * It could be changed in several ways.  The reason it is done the
23219          * current way is to avoid having to save and restore
23220          * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
23221          * could be used.  Another suggestion is to keep the authoritative
23222          * value of the debug flags in a thread-local variable and add set/get
23223          * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23224          * Still another is to pass a flag, say in the engine's intflags that
23225          * would be checked each time before doing the debug output */
23226     }
23227 #endif
23228
23229     assert(subpattern_re);  /* Should have died if didn't compile successfully */
23230     return subpattern_re;
23231 }
23232
23233 STATIC I32
23234 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23235          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23236 {
23237     I32 result;
23238     DECLARE_AND_GET_RE_DEBUG_FLAGS;
23239
23240     PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23241
23242     ENTER;
23243
23244     /* The compilation has set things up so that if the program doesn't want to
23245      * see the wildcard matching procedure, it will get the core execution
23246      * engine, which is subject only to -Dr.  So we have to turn that off
23247      * around this procedure */
23248     if (! isDEBUG_WILDCARD) {
23249         /* Note! Casts away 'volatile' */
23250         SAVEI32(PL_debug);
23251         PL_debug &= ~ DEBUG_r_FLAG;
23252     }
23253
23254     result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23255                          NULL, nosave);
23256     LEAVE;
23257
23258     return result;
23259 }
23260
23261 SV *
23262 S_handle_user_defined_property(pTHX_
23263
23264     /* Parses the contents of a user-defined property definition; returning the
23265      * expanded definition if possible.  If so, the return is an inversion
23266      * list.
23267      *
23268      * If there are subroutines that are part of the expansion and which aren't
23269      * known at the time of the call to this function, this returns what
23270      * parse_uniprop_string() returned for the first one encountered.
23271      *
23272      * If an error was found, NULL is returned, and 'msg' gets a suitable
23273      * message appended to it.  (Appending allows the back trace of how we got
23274      * to the faulty definition to be displayed through nested calls of
23275      * user-defined subs.)
23276      *
23277      * The caller IS responsible for freeing any returned SV.
23278      *
23279      * The syntax of the contents is pretty much described in perlunicode.pod,
23280      * but we also allow comments on each line */
23281
23282     const char * name,          /* Name of property */
23283     const STRLEN name_len,      /* The name's length in bytes */
23284     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23285     const bool to_fold,         /* ? Is this under /i */
23286     const bool runtime,         /* ? Are we in compile- or run-time */
23287     const bool deferrable,      /* Is it ok for this property's full definition
23288                                    to be deferred until later? */
23289     SV* contents,               /* The property's definition */
23290     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
23291                                    getting called unless this is thought to be
23292                                    a user-defined property */
23293     SV * msg,                   /* Any error or warning msg(s) are appended to
23294                                    this */
23295     const STRLEN level)         /* Recursion level of this call */
23296 {
23297     STRLEN len;
23298     const char * string         = SvPV_const(contents, len);
23299     const char * const e        = string + len;
23300     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23301     const STRLEN msgs_length_on_entry = SvCUR(msg);
23302
23303     const char * s0 = string;   /* Points to first byte in the current line
23304                                    being parsed in 'string' */
23305     const char overflow_msg[] = "Code point too large in \"";
23306     SV* running_definition = NULL;
23307
23308     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23309
23310     *user_defined_ptr = TRUE;
23311
23312     /* Look at each line */
23313     while (s0 < e) {
23314         const char * s;     /* Current byte */
23315         char op = '+';      /* Default operation is 'union' */
23316         IV   min = 0;       /* range begin code point */
23317         IV   max = -1;      /* and range end */
23318         SV* this_definition;
23319
23320         /* Skip comment lines */
23321         if (*s0 == '#') {
23322             s0 = strchr(s0, '\n');
23323             if (s0 == NULL) {
23324                 break;
23325             }
23326             s0++;
23327             continue;
23328         }
23329
23330         /* For backcompat, allow an empty first line */
23331         if (*s0 == '\n') {
23332             s0++;
23333             continue;
23334         }
23335
23336         /* First character in the line may optionally be the operation */
23337         if (   *s0 == '+'
23338             || *s0 == '!'
23339             || *s0 == '-'
23340             || *s0 == '&')
23341         {
23342             op = *s0++;
23343         }
23344
23345         /* If the line is one or two hex digits separated by blank space, its
23346          * a range; otherwise it is either another user-defined property or an
23347          * error */
23348
23349         s = s0;
23350
23351         if (! isXDIGIT(*s)) {
23352             goto check_if_property;
23353         }
23354
23355         do { /* Each new hex digit will add 4 bits. */
23356             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23357                 s = strchr(s, '\n');
23358                 if (s == NULL) {
23359                     s = e;
23360                 }
23361                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23362                 sv_catpv(msg, overflow_msg);
23363                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23364                                      UTF8fARG(is_contents_utf8, s - s0, s0));
23365                 sv_catpvs(msg, "\"");
23366                 goto return_failure;
23367             }
23368
23369             /* Accumulate this digit into the value */
23370             min = (min << 4) + READ_XDIGIT(s);
23371         } while (isXDIGIT(*s));
23372
23373         while (isBLANK(*s)) { s++; }
23374
23375         /* We allow comments at the end of the line */
23376         if (*s == '#') {
23377             s = strchr(s, '\n');
23378             if (s == NULL) {
23379                 s = e;
23380             }
23381             s++;
23382         }
23383         else if (s < e && *s != '\n') {
23384             if (! isXDIGIT(*s)) {
23385                 goto check_if_property;
23386             }
23387
23388             /* Look for the high point of the range */
23389             max = 0;
23390             do {
23391                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23392                     s = strchr(s, '\n');
23393                     if (s == NULL) {
23394                         s = e;
23395                     }
23396                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23397                     sv_catpv(msg, overflow_msg);
23398                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23399                                       UTF8fARG(is_contents_utf8, s - s0, s0));
23400                     sv_catpvs(msg, "\"");
23401                     goto return_failure;
23402                 }
23403
23404                 max = (max << 4) + READ_XDIGIT(s);
23405             } while (isXDIGIT(*s));
23406
23407             while (isBLANK(*s)) { s++; }
23408
23409             if (*s == '#') {
23410                 s = strchr(s, '\n');
23411                 if (s == NULL) {
23412                     s = e;
23413                 }
23414             }
23415             else if (s < e && *s != '\n') {
23416                 goto check_if_property;
23417             }
23418         }
23419
23420         if (max == -1) {    /* The line only had one entry */
23421             max = min;
23422         }
23423         else if (max < min) {
23424             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23425             sv_catpvs(msg, "Illegal range in \"");
23426             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23427                                 UTF8fARG(is_contents_utf8, s - s0, s0));
23428             sv_catpvs(msg, "\"");
23429             goto return_failure;
23430         }
23431
23432 #  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
23433
23434         if (   UNICODE_IS_PERL_EXTENDED(min)
23435             || UNICODE_IS_PERL_EXTENDED(max))
23436         {
23437             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23438
23439             /* If both code points are non-portable, warn only on the lower
23440              * one. */
23441             sv_catpv(msg, get_extended_utf8_msg(
23442                                             (UNICODE_IS_PERL_EXTENDED(min))
23443                                             ? min : max));
23444             sv_catpvs(msg, " in \"");
23445             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23446                                  UTF8fARG(is_contents_utf8, s - s0, s0));
23447             sv_catpvs(msg, "\"");
23448         }
23449
23450 #  endif
23451
23452         /* Here, this line contains a legal range */
23453         this_definition = sv_2mortal(_new_invlist(2));
23454         this_definition = _add_range_to_invlist(this_definition, min, max);
23455         goto calculate;
23456
23457       check_if_property:
23458
23459         /* Here it isn't a legal range line.  See if it is a legal property
23460          * line.  First find the end of the meat of the line */
23461         s = strpbrk(s, "#\n");
23462         if (s == NULL) {
23463             s = e;
23464         }
23465
23466         /* Ignore trailing blanks in keeping with the requirements of
23467          * parse_uniprop_string() */
23468         s--;
23469         while (s > s0 && isBLANK_A(*s)) {
23470             s--;
23471         }
23472         s++;
23473
23474         this_definition = parse_uniprop_string(s0, s - s0,
23475                                                is_utf8, to_fold, runtime,
23476                                                deferrable,
23477                                                NULL,
23478                                                user_defined_ptr, msg,
23479                                                (name_len == 0)
23480                                                 ? level /* Don't increase level
23481                                                            if input is empty */
23482                                                 : level + 1
23483                                               );
23484         if (this_definition == NULL) {
23485             goto return_failure;    /* 'msg' should have had the reason
23486                                        appended to it by the above call */
23487         }
23488
23489         if (! is_invlist(this_definition)) {    /* Unknown at this time */
23490             return newSVsv(this_definition);
23491         }
23492
23493         if (*s != '\n') {
23494             s = strchr(s, '\n');
23495             if (s == NULL) {
23496                 s = e;
23497             }
23498         }
23499
23500       calculate:
23501
23502         switch (op) {
23503             case '+':
23504                 _invlist_union(running_definition, this_definition,
23505                                                         &running_definition);
23506                 break;
23507             case '-':
23508                 _invlist_subtract(running_definition, this_definition,
23509                                                         &running_definition);
23510                 break;
23511             case '&':
23512                 _invlist_intersection(running_definition, this_definition,
23513                                                         &running_definition);
23514                 break;
23515             case '!':
23516                 _invlist_union_complement_2nd(running_definition,
23517                                         this_definition, &running_definition);
23518                 break;
23519             default:
23520                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23521                                  __FILE__, __LINE__, op);
23522                 break;
23523         }
23524
23525         /* Position past the '\n' */
23526         s0 = s + 1;
23527     }   /* End of loop through the lines of 'contents' */
23528
23529     /* Here, we processed all the lines in 'contents' without error.  If we
23530      * didn't add any warnings, simply return success */
23531     if (msgs_length_on_entry == SvCUR(msg)) {
23532
23533         /* If the expansion was empty, the answer isn't nothing: its an empty
23534          * inversion list */
23535         if (running_definition == NULL) {
23536             running_definition = _new_invlist(1);
23537         }
23538
23539         return running_definition;
23540     }
23541
23542     /* Otherwise, add some explanatory text, but we will return success */
23543     goto return_msg;
23544
23545   return_failure:
23546     running_definition = NULL;
23547
23548   return_msg:
23549
23550     if (name_len > 0) {
23551         sv_catpvs(msg, " in expansion of ");
23552         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23553     }
23554
23555     return running_definition;
23556 }
23557
23558 /* As explained below, certain operations need to take place in the first
23559  * thread created.  These macros switch contexts */
23560 #  ifdef USE_ITHREADS
23561 #    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
23562                                         PerlInterpreter * save_aTHX = aTHX;
23563 #    define SWITCH_TO_GLOBAL_CONTEXT                                        \
23564                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23565 #    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23566 #    define CUR_CONTEXT      aTHX
23567 #    define ORIGINAL_CONTEXT save_aTHX
23568 #  else
23569 #    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
23570 #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23571 #    define RESTORE_CONTEXT                   NOOP
23572 #    define CUR_CONTEXT                       NULL
23573 #    define ORIGINAL_CONTEXT                  NULL
23574 #  endif
23575
23576 STATIC void
23577 S_delete_recursion_entry(pTHX_ void *key)
23578 {
23579     /* Deletes the entry used to detect recursion when expanding user-defined
23580      * properties.  This is a function so it can be set up to be called even if
23581      * the program unexpectedly quits */
23582
23583     SV ** current_entry;
23584     const STRLEN key_len = strlen((const char *) key);
23585     DECLARATION_FOR_GLOBAL_CONTEXT;
23586
23587     SWITCH_TO_GLOBAL_CONTEXT;
23588
23589     /* If the entry is one of these types, it is a permanent entry, and not the
23590      * one used to detect recursions.  This function should delete only the
23591      * recursion entry */
23592     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23593     if (     current_entry
23594         && ! is_invlist(*current_entry)
23595         && ! SvPOK(*current_entry))
23596     {
23597         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23598                                                                     G_DISCARD);
23599     }
23600
23601     RESTORE_CONTEXT;
23602 }
23603
23604 STATIC SV *
23605 S_get_fq_name(pTHX_
23606               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23607               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23608               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23609               const bool has_colon_colon
23610              )
23611 {
23612     /* Returns a mortal SV containing the fully qualified version of the input
23613      * name */
23614
23615     SV * fq_name;
23616
23617     fq_name = newSVpvs_flags("", SVs_TEMP);
23618
23619     /* Use the current package if it wasn't included in our input */
23620     if (! has_colon_colon) {
23621         const HV * pkg = (IN_PERL_COMPILETIME)
23622                          ? PL_curstash
23623                          : CopSTASH(PL_curcop);
23624         const char* pkgname = HvNAME(pkg);
23625
23626         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23627                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23628         sv_catpvs(fq_name, "::");
23629     }
23630
23631     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23632                          UTF8fARG(is_utf8, name_len, name));
23633     return fq_name;
23634 }
23635
23636 STATIC SV *
23637 S_parse_uniprop_string(pTHX_
23638
23639     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23640      * now.  If so, the return is an inversion list.
23641      *
23642      * If the property is user-defined, it is a subroutine, which in turn
23643      * may call other subroutines.  This function will call the whole nest of
23644      * them to get the definition they return; if some aren't known at the time
23645      * of the call to this function, the fully qualified name of the highest
23646      * level sub is returned.  It is an error to call this function at runtime
23647      * without every sub defined.
23648      *
23649      * If an error was found, NULL is returned, and 'msg' gets a suitable
23650      * message appended to it.  (Appending allows the back trace of how we got
23651      * to the faulty definition to be displayed through nested calls of
23652      * user-defined subs.)
23653      *
23654      * The caller should NOT try to free any returned inversion list.
23655      *
23656      * Other parameters will be set on return as described below */
23657
23658     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23659     Size_t name_len,            /* Its length in bytes, not including any
23660                                    trailing space */
23661     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23662     const bool to_fold,         /* ? Is this under /i */
23663     const bool runtime,         /* TRUE if this is being called at run time */
23664     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23665                                    known at this call */
23666     AV ** strings,              /* To return string property values, like named
23667                                    sequences */
23668     bool *user_defined_ptr,     /* Upon return from this function it will be
23669                                    set to TRUE if any component is a
23670                                    user-defined property */
23671     SV * msg,                   /* Any error or warning msg(s) are appended to
23672                                    this */
23673     const STRLEN level)         /* Recursion level of this call */
23674 {
23675     char* lookup_name;          /* normalized name for lookup in our tables */
23676     unsigned lookup_len;        /* Its length */
23677     enum { Not_Strict = 0,      /* Some properties have stricter name */
23678            Strict,              /* normalization rules, which we decide */
23679            As_Is                /* upon based on parsing */
23680          } stricter = Not_Strict;
23681
23682     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23683      * (though it requires extra effort to download them from Unicode and
23684      * compile perl to know about them) */
23685     bool is_nv_type = FALSE;
23686
23687     unsigned int i, j = 0;
23688     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23689     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23690     int table_index = 0;    /* The entry number for this property in the table
23691                                of all Unicode property names */
23692     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23693     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23694                                    the normalized name in certain situations */
23695     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23696                                    part of a package name */
23697     Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
23698     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23699                                              property rather than a Unicode
23700                                              one. */
23701     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23702                                      if an error.  If it is an inversion list,
23703                                      it is the definition.  Otherwise it is a
23704                                      string containing the fully qualified sub
23705                                      name of 'name' */
23706     SV * fq_name = NULL;        /* For user-defined properties, the fully
23707                                    qualified name */
23708     bool invert_return = FALSE; /* ? Do we need to complement the result before
23709                                      returning it */
23710     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23711                                        explicit utf8:: package that we strip
23712                                        off  */
23713     /* The expansion of properties that could be either user-defined or
23714      * official unicode ones is deferred until runtime, including a marker for
23715      * those that might be in the latter category.  This boolean indicates if
23716      * we've seen that marker.  If not, what we're parsing can't be such an
23717      * official Unicode property whose expansion was deferred */
23718     bool could_be_deferred_official = FALSE;
23719
23720     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23721
23722     /* The input will be normalized into 'lookup_name' */
23723     Newx(lookup_name, name_len, char);
23724     SAVEFREEPV(lookup_name);
23725
23726     /* Parse the input. */
23727     for (i = 0; i < name_len; i++) {
23728         char cur = name[i];
23729
23730         /* Most of the characters in the input will be of this ilk, being parts
23731          * of a name */
23732         if (isIDCONT_A(cur)) {
23733
23734             /* Case differences are ignored.  Our lookup routine assumes
23735              * everything is lowercase, so normalize to that */
23736             if (isUPPER_A(cur)) {
23737                 lookup_name[j++] = toLOWER_A(cur);
23738                 continue;
23739             }
23740
23741             if (cur == '_') { /* Don't include these in the normalized name */
23742                 continue;
23743             }
23744
23745             lookup_name[j++] = cur;
23746
23747             /* The first character in a user-defined name must be of this type.
23748              * */
23749             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23750                 could_be_user_defined = FALSE;
23751             }
23752
23753             continue;
23754         }
23755
23756         /* Here, the character is not something typically in a name,  But these
23757          * two types of characters (and the '_' above) can be freely ignored in
23758          * most situations.  Later it may turn out we shouldn't have ignored
23759          * them, and we have to reparse, but we don't have enough information
23760          * yet to make that decision */
23761         if (cur == '-' || isSPACE_A(cur)) {
23762             could_be_user_defined = FALSE;
23763             continue;
23764         }
23765
23766         /* An equals sign or single colon mark the end of the first part of
23767          * the property name */
23768         if (    cur == '='
23769             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23770         {
23771             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23772             equals_pos = j; /* Note where it occurred in the input */
23773             could_be_user_defined = FALSE;
23774             break;
23775         }
23776
23777         /* If this looks like it is a marker we inserted at compile time,
23778          * set a flag and otherwise ignore it.  If it isn't in the final
23779          * position, keep it as it would have been user input. */
23780         if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23781             && ! deferrable
23782             &&   could_be_user_defined
23783             &&   i == name_len - 1)
23784         {
23785             name_len--;
23786             could_be_deferred_official = TRUE;
23787             continue;
23788         }
23789
23790         /* Otherwise, this character is part of the name. */
23791         lookup_name[j++] = cur;
23792
23793         /* Here it isn't a single colon, so if it is a colon, it must be a
23794          * double colon */
23795         if (cur == ':') {
23796
23797             /* A double colon should be a package qualifier.  We note its
23798              * position and continue.  Note that one could have
23799              *      pkg1::pkg2::...::foo
23800              * so that the position at the end of the loop will be just after
23801              * the final qualifier */
23802
23803             i++;
23804             non_pkg_begin = i + 1;
23805             lookup_name[j++] = ':';
23806             lun_non_pkg_begin = j;
23807         }
23808         else { /* Only word chars (and '::') can be in a user-defined name */
23809             could_be_user_defined = FALSE;
23810         }
23811     } /* End of parsing through the lhs of the property name (or all of it if
23812          no rhs) */
23813
23814 #  define STRLENs(s)  (sizeof("" s "") - 1)
23815
23816     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23817      * be for a user-defined property, or it could be a Unicode property, as
23818      * all of them are considered to be for that package.  For the purposes of
23819      * parsing the rest of the property, strip it off */
23820     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23821         lookup_name +=  STRLENs("utf8::");
23822         j -=  STRLENs("utf8::");
23823         equals_pos -=  STRLENs("utf8::");
23824         stripped_utf8_pkg = TRUE;
23825     }
23826
23827     /* Here, we are either done with the whole property name, if it was simple;
23828      * or are positioned just after the '=' if it is compound. */
23829
23830     if (equals_pos >= 0) {
23831         assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23832
23833         /* Space immediately after the '=' is ignored */
23834         i++;
23835         for (; i < name_len; i++) {
23836             if (! isSPACE_A(name[i])) {
23837                 break;
23838             }
23839         }
23840
23841         /* Most punctuation after the equals indicates a subpattern, like
23842          * \p{foo=/bar/} */
23843         if (   isPUNCT_A(name[i])
23844             &&  name[i] != '-'
23845             &&  name[i] != '+'
23846             &&  name[i] != '_'
23847             &&  name[i] != '{'
23848                 /* A backslash means the real delimitter is the next character,
23849                  * but it must be punctuation */
23850             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23851         {
23852             bool special_property = memEQs(lookup_name, j - 1, "name")
23853                                  || memEQs(lookup_name, j - 1, "na");
23854             if (! special_property) {
23855                 /* Find the property.  The table includes the equals sign, so
23856                  * we use 'j' as-is */
23857                 table_index = do_uniprop_match(lookup_name, j);
23858             }
23859             if (special_property || table_index) {
23860                 REGEXP * subpattern_re;
23861                 char open = name[i++];
23862                 char close;
23863                 const char * pos_in_brackets;
23864                 const char * const * prop_values;
23865                 bool escaped = 0;
23866
23867                 /* Backslash => delimitter is the character following.  We
23868                  * already checked that it is punctuation */
23869                 if (open == '\\') {
23870                     open = name[i++];
23871                     escaped = 1;
23872                 }
23873
23874                 /* This data structure is constructed so that the matching
23875                  * closing bracket is 3 past its matching opening.  The second
23876                  * set of closing is so that if the opening is something like
23877                  * ']', the closing will be that as well.  Something similar is
23878                  * done in toke.c */
23879                 pos_in_brackets = memCHRs("([<)]>)]>", open);
23880                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23881
23882                 if (    i >= name_len
23883                     ||  name[name_len-1] != close
23884                     || (escaped && name[name_len-2] != '\\')
23885                         /* Also make sure that there are enough characters.
23886                          * e.g., '\\\' would show up incorrectly as legal even
23887                          * though it is too short */
23888                     || (SSize_t) (name_len - i - 1 - escaped) < 0)
23889                 {
23890                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23891                     goto append_name_to_msg;
23892                 }
23893
23894                 Perl_ck_warner_d(aTHX_
23895                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23896                     "The Unicode property wildcards feature is experimental");
23897
23898                 if (special_property) {
23899                     const char * error_msg;
23900                     const char * revised_name = name + i;
23901                     Size_t revised_name_len = name_len - (i + 1 + escaped);
23902
23903                     /* Currently, the only 'special_property' is name, which we
23904                      * lookup in _charnames.pm */
23905
23906                     if (! load_charnames(newSVpvs("placeholder"),
23907                                          revised_name, revised_name_len,
23908                                          &error_msg))
23909                     {
23910                         sv_catpv(msg, error_msg);
23911                         goto append_name_to_msg;
23912                     }
23913
23914                     /* Farm this out to a function just to make the current
23915                      * function less unwieldy */
23916                     if (handle_names_wildcard(revised_name, revised_name_len,
23917                                               &prop_definition,
23918                                               strings))
23919                     {
23920                         return prop_definition;
23921                     }
23922
23923                     goto failed;
23924                 }
23925
23926                 prop_values = get_prop_values(table_index);
23927
23928                 /* Now create and compile the wildcard subpattern.  Use /i
23929                  * because the property values are supposed to match with case
23930                  * ignored. */
23931                 subpattern_re = compile_wildcard(name + i,
23932                                                  name_len - i - 1 - escaped,
23933                                                  TRUE /* /i */
23934                                                 );
23935
23936                 /* For each legal property value, see if the supplied pattern
23937                  * matches it. */
23938                 while (*prop_values) {
23939                     const char * const entry = *prop_values;
23940                     const Size_t len = strlen(entry);
23941                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23942
23943                     if (execute_wildcard(subpattern_re,
23944                                  (char *) entry,
23945                                  (char *) entry + len,
23946                                  (char *) entry, 0,
23947                                  entry_sv,
23948                                  0))
23949                     { /* Here, matched.  Add to the returned list */
23950                         Size_t total_len = j + len;
23951                         SV * sub_invlist = NULL;
23952                         char * this_string;
23953
23954                         /* We know this is a legal \p{property=value}.  Call
23955                          * the function to return the list of code points that
23956                          * match it */
23957                         Newxz(this_string, total_len + 1, char);
23958                         Copy(lookup_name, this_string, j, char);
23959                         my_strlcat(this_string, entry, total_len + 1);
23960                         SAVEFREEPV(this_string);
23961                         sub_invlist = parse_uniprop_string(this_string,
23962                                                            total_len,
23963                                                            is_utf8,
23964                                                            to_fold,
23965                                                            runtime,
23966                                                            deferrable,
23967                                                            NULL,
23968                                                            user_defined_ptr,
23969                                                            msg,
23970                                                            level + 1);
23971                         _invlist_union(prop_definition, sub_invlist,
23972                                        &prop_definition);
23973                     }
23974
23975                     prop_values++;  /* Next iteration, look at next propvalue */
23976                 } /* End of looking through property values; (the data
23977                      structure is terminated by a NULL ptr) */
23978
23979                 SvREFCNT_dec_NN(subpattern_re);
23980
23981                 if (prop_definition) {
23982                     return prop_definition;
23983                 }
23984
23985                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23986                 goto append_name_to_msg;
23987             }
23988
23989             /* Here's how khw thinks we should proceed to handle the properties
23990              * not yet done:    Bidi Mirroring Glyph        can map to ""
23991                                 Bidi Paired Bracket         can map to ""
23992                                 Case Folding  (both full and simple)
23993                                             Shouldn't /i be good enough for Full
23994                                 Decomposition Mapping
23995                                 Equivalent Unified Ideograph    can map to ""
23996                                 Lowercase Mapping  (both full and simple)
23997                                 NFKC Case Fold                  can map to ""
23998                                 Titlecase Mapping  (both full and simple)
23999                                 Uppercase Mapping  (both full and simple)
24000              * Handle these the same way Name is done, using say, _wild.pm, but
24001              * having both loose and full, like in charclass_invlists.h.
24002              * Perhaps move block and script to that as they are somewhat large
24003              * in charclass_invlists.h.
24004              * For properties where the default is the code point itself, such
24005              * as any of the case changing mappings, the string would otherwise
24006              * consist of all Unicode code points in UTF-8 strung together.
24007              * This would be impractical.  So instead, examine their compiled
24008              * pattern, looking at the ssc.  If none, reject the pattern as an
24009              * error.  Otherwise run the pattern against every code point in
24010              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
24011              * And it might be good to create an API to return the ssc.
24012              * Or handle them like the algorithmic names are done
24013              */
24014         } /* End of is a wildcard subppattern */
24015
24016         /* \p{name=...} is handled specially.  Instead of using the normal
24017          * mechanism involving charclass_invlists.h, it uses _charnames.pm
24018          * which has the necessary (huge) data accessible to it, and which
24019          * doesn't get loaded unless necessary.  The legal syntax for names is
24020          * somewhat different than other properties due both to the vagaries of
24021          * a few outlier official names, and the fact that only a few ASCII
24022          * characters are permitted in them */
24023         if (   memEQs(lookup_name, j - 1, "name")
24024             || memEQs(lookup_name, j - 1, "na"))
24025         {
24026             dSP;
24027             HV * table;
24028             SV * character;
24029             const char * error_msg;
24030             CV* lookup_loose;
24031             SV * character_name;
24032             STRLEN character_len;
24033             UV cp;
24034
24035             stricter = As_Is;
24036
24037             /* Since the RHS (after skipping initial space) is passed unchanged
24038              * to charnames, and there are different criteria for what are
24039              * legal characters in the name, just parse it here.  A character
24040              * name must begin with an ASCII alphabetic */
24041             if (! isALPHA(name[i])) {
24042                 goto failed;
24043             }
24044             lookup_name[j++] = name[i];
24045
24046             for (++i; i < name_len; i++) {
24047                 /* Official names can only be in the ASCII range, and only
24048                  * certain characters */
24049                 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24050                     goto failed;
24051                 }
24052                 lookup_name[j++] = name[i];
24053             }
24054
24055             /* Finished parsing, save the name into an SV */
24056             character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24057
24058             /* Make sure _charnames is loaded.  (The parameters give context
24059              * for any errors generated */
24060             table = load_charnames(character_name, name, name_len, &error_msg);
24061             if (table == NULL) {
24062                 sv_catpv(msg, error_msg);
24063                 goto append_name_to_msg;
24064             }
24065
24066             lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24067             if (! lookup_loose) {
24068                 Perl_croak(aTHX_
24069                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
24070             }
24071
24072             PUSHSTACKi(PERLSI_REGCOMP);
24073             ENTER ;
24074             SAVETMPS;
24075             save_re_context();
24076
24077             PUSHMARK(SP) ;
24078             XPUSHs(character_name);
24079             PUTBACK;
24080             call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24081
24082             SPAGAIN ;
24083
24084             character = POPs;
24085             SvREFCNT_inc_simple_void_NN(character);
24086
24087             PUTBACK ;
24088             FREETMPS ;
24089             LEAVE ;
24090             POPSTACK;
24091
24092             if (! SvOK(character)) {
24093                 goto failed;
24094             }
24095
24096             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24097             if (character_len == SvCUR(character)) {
24098                 prop_definition = add_cp_to_invlist(NULL, cp);
24099             }
24100             else {
24101                 AV * this_string;
24102
24103                 /* First of the remaining characters in the string. */
24104                 char * remaining = SvPVX(character) + character_len;
24105
24106                 if (strings == NULL) {
24107                     goto failed;    /* XXX Perhaps a specific msg instead, like
24108                                        'not available here' */
24109                 }
24110
24111                 if (*strings == NULL) {
24112                     *strings = newAV();
24113                 }
24114
24115                 this_string = newAV();
24116                 av_push(this_string, newSVuv(cp));
24117
24118                 do {
24119                     cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24120                     av_push(this_string, newSVuv(cp));
24121                     remaining += character_len;
24122                 } while (remaining < SvEND(character));
24123
24124                 av_push(*strings, (SV *) this_string);
24125             }
24126
24127             return prop_definition;
24128         }
24129
24130         /* Certain properties whose values are numeric need special handling.
24131          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
24132          * purposes of checking if this is one of those properties */
24133         if (memBEGINPs(lookup_name, j, "is")) {
24134             lookup_offset = 2;
24135         }
24136
24137         /* Then check if it is one of these specially-handled properties.  The
24138          * possibilities are hard-coded because easier this way, and the list
24139          * is unlikely to change.
24140          *
24141          * All numeric value type properties are of this ilk, and are also
24142          * special in a different way later on.  So find those first.  There
24143          * are several numeric value type properties in the Unihan DB (which is
24144          * unlikely to be compiled with perl, but we handle it here in case it
24145          * does get compiled).  They all end with 'numeric'.  The interiors
24146          * aren't checked for the precise property.  This would stop working if
24147          * a cjk property were to be created that ended with 'numeric' and
24148          * wasn't a numeric type */
24149         is_nv_type = memEQs(lookup_name + lookup_offset,
24150                        j - 1 - lookup_offset, "numericvalue")
24151                   || memEQs(lookup_name + lookup_offset,
24152                       j - 1 - lookup_offset, "nv")
24153                   || (   memENDPs(lookup_name + lookup_offset,
24154                             j - 1 - lookup_offset, "numeric")
24155                       && (   memBEGINPs(lookup_name + lookup_offset,
24156                                       j - 1 - lookup_offset, "cjk")
24157                           || memBEGINPs(lookup_name + lookup_offset,
24158                                       j - 1 - lookup_offset, "k")));
24159         if (   is_nv_type
24160             || memEQs(lookup_name + lookup_offset,
24161                       j - 1 - lookup_offset, "canonicalcombiningclass")
24162             || memEQs(lookup_name + lookup_offset,
24163                       j - 1 - lookup_offset, "ccc")
24164             || memEQs(lookup_name + lookup_offset,
24165                       j - 1 - lookup_offset, "age")
24166             || memEQs(lookup_name + lookup_offset,
24167                       j - 1 - lookup_offset, "in")
24168             || memEQs(lookup_name + lookup_offset,
24169                       j - 1 - lookup_offset, "presentin"))
24170         {
24171             unsigned int k;
24172
24173             /* Since the stuff after the '=' is a number, we can't throw away
24174              * '-' willy-nilly, as those could be a minus sign.  Other stricter
24175              * rules also apply.  However, these properties all can have the
24176              * rhs not be a number, in which case they contain at least one
24177              * alphabetic.  In those cases, the stricter rules don't apply.
24178              * But the numeric type properties can have the alphas [Ee] to
24179              * signify an exponent, and it is still a number with stricter
24180              * rules.  So look for an alpha that signifies not-strict */
24181             stricter = Strict;
24182             for (k = i; k < name_len; k++) {
24183                 if (   isALPHA_A(name[k])
24184                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24185                 {
24186                     stricter = Not_Strict;
24187                     break;
24188                 }
24189             }
24190         }
24191
24192         if (stricter) {
24193
24194             /* A number may have a leading '+' or '-'.  The latter is retained
24195              * */
24196             if (name[i] == '+') {
24197                 i++;
24198             }
24199             else if (name[i] == '-') {
24200                 lookup_name[j++] = '-';
24201                 i++;
24202             }
24203
24204             /* Skip leading zeros including single underscores separating the
24205              * zeros, or between the final leading zero and the first other
24206              * digit */
24207             for (; i < name_len - 1; i++) {
24208                 if (    name[i] != '0'
24209                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24210                 {
24211                     break;
24212                 }
24213             }
24214         }
24215     }
24216     else {  /* No '=' */
24217
24218        /* Only a few properties without an '=' should be parsed with stricter
24219         * rules.  The list is unlikely to change. */
24220         if (   memBEGINPs(lookup_name, j, "perl")
24221             && memNEs(lookup_name + 4, j - 4, "space")
24222             && memNEs(lookup_name + 4, j - 4, "word"))
24223         {
24224             stricter = Strict;
24225
24226             /* We set the inputs back to 0 and the code below will reparse,
24227              * using strict */
24228             i = j = 0;
24229         }
24230     }
24231
24232     /* Here, we have either finished the property, or are positioned to parse
24233      * the remainder, and we know if stricter rules apply.  Finish out, if not
24234      * already done */
24235     for (; i < name_len; i++) {
24236         char cur = name[i];
24237
24238         /* In all instances, case differences are ignored, and we normalize to
24239          * lowercase */
24240         if (isUPPER_A(cur)) {
24241             lookup_name[j++] = toLOWER(cur);
24242             continue;
24243         }
24244
24245         /* An underscore is skipped, but not under strict rules unless it
24246          * separates two digits */
24247         if (cur == '_') {
24248             if (    stricter
24249                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
24250                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24251             {
24252                 lookup_name[j++] = '_';
24253             }
24254             continue;
24255         }
24256
24257         /* Hyphens are skipped except under strict */
24258         if (cur == '-' && ! stricter) {
24259             continue;
24260         }
24261
24262         /* XXX Bug in documentation.  It says white space skipped adjacent to
24263          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
24264          * in a number */
24265         if (isSPACE_A(cur) && ! stricter) {
24266             continue;
24267         }
24268
24269         lookup_name[j++] = cur;
24270
24271         /* Unless this is a non-trailing slash, we are done with it */
24272         if (i >= name_len - 1 || cur != '/') {
24273             continue;
24274         }
24275
24276         slash_pos = j;
24277
24278         /* A slash in the 'numeric value' property indicates that what follows
24279          * is a denominator.  It can have a leading '+' and '0's that should be
24280          * skipped.  But we have never allowed a negative denominator, so treat
24281          * a minus like every other character.  (No need to rule out a second
24282          * '/', as that won't match anything anyway */
24283         if (is_nv_type) {
24284             i++;
24285             if (i < name_len && name[i] == '+') {
24286                 i++;
24287             }
24288
24289             /* Skip leading zeros including underscores separating digits */
24290             for (; i < name_len - 1; i++) {
24291                 if (   name[i] != '0'
24292                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24293                 {
24294                     break;
24295                 }
24296             }
24297
24298             /* Store the first real character in the denominator */
24299             if (i < name_len) {
24300                 lookup_name[j++] = name[i];
24301             }
24302         }
24303     }
24304
24305     /* Here are completely done parsing the input 'name', and 'lookup_name'
24306      * contains a copy, normalized.
24307      *
24308      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24309      * different from without the underscores.  */
24310     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
24311            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24312         && UNLIKELY(name[name_len-1] == '_'))
24313     {
24314         lookup_name[j++] = '&';
24315     }
24316
24317     /* If the original input began with 'In' or 'Is', it could be a subroutine
24318      * call to a user-defined property instead of a Unicode property name. */
24319     if (    name_len - non_pkg_begin > 2
24320         &&  name[non_pkg_begin+0] == 'I'
24321         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24322     {
24323         /* Names that start with In have different characterstics than those
24324          * that start with Is */
24325         if (name[non_pkg_begin+1] == 's') {
24326             starts_with_Is = TRUE;
24327         }
24328     }
24329     else {
24330         could_be_user_defined = FALSE;
24331     }
24332
24333     if (could_be_user_defined) {
24334         CV* user_sub;
24335
24336         /* If the user defined property returns the empty string, it could
24337          * easily be because the pattern is being compiled before the data it
24338          * actually needs to compile is available.  This could be argued to be
24339          * a bug in the perl code, but this is a change of behavior for Perl,
24340          * so we handle it.  This means that intentionally returning nothing
24341          * will not be resolved until runtime */
24342         bool empty_return = FALSE;
24343
24344         /* Here, the name could be for a user defined property, which are
24345          * implemented as subs. */
24346         user_sub = get_cvn_flags(name, name_len, 0);
24347         if (! user_sub) {
24348
24349             /* Here, the property name could be a user-defined one, but there
24350              * is no subroutine to handle it (as of now).   Defer handling it
24351              * until runtime.  Otherwise, a block defined by Unicode in a later
24352              * release would get the synonym InFoo added for it, and existing
24353              * code that used that name would suddenly break if it referred to
24354              * the property before the sub was declared.  See [perl #134146] */
24355             if (deferrable) {
24356                 goto definition_deferred;
24357             }
24358
24359             /* Here, we are at runtime, and didn't find the user property.  It
24360              * could be an official property, but only if no package was
24361              * specified, or just the utf8:: package. */
24362             if (could_be_deferred_official) {
24363                 lookup_name += lun_non_pkg_begin;
24364                 j -= lun_non_pkg_begin;
24365             }
24366             else if (! stripped_utf8_pkg) {
24367                 goto unknown_user_defined;
24368             }
24369
24370             /* Drop down to look up in the official properties */
24371         }
24372         else {
24373             const char insecure[] = "Insecure user-defined property";
24374
24375             /* Here, there is a sub by the correct name.  Normally we call it
24376              * to get the property definition */
24377             dSP;
24378             SV * user_sub_sv = MUTABLE_SV(user_sub);
24379             SV * error;     /* Any error returned by calling 'user_sub' */
24380             SV * key;       /* The key into the hash of user defined sub names
24381                              */
24382             SV * placeholder;
24383             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
24384
24385             /* How many times to retry when another thread is in the middle of
24386              * expanding the same definition we want */
24387             PERL_INT_FAST8_T retry_countdown = 10;
24388
24389             DECLARATION_FOR_GLOBAL_CONTEXT;
24390
24391             /* If we get here, we know this property is user-defined */
24392             *user_defined_ptr = TRUE;
24393
24394             /* We refuse to call a potentially tainted subroutine; returning an
24395              * error instead */
24396             if (TAINT_get) {
24397                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24398                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24399                 goto append_name_to_msg;
24400             }
24401
24402             /* In principal, we only call each subroutine property definition
24403              * once during the life of the program.  This guarantees that the
24404              * property definition never changes.  The results of the single
24405              * sub call are stored in a hash, which is used instead for future
24406              * references to this property.  The property definition is thus
24407              * immutable.  But, to allow the user to have a /i-dependent
24408              * definition, we call the sub once for non-/i, and once for /i,
24409              * should the need arise, passing the /i status as a parameter.
24410              *
24411              * We start by constructing the hash key name, consisting of the
24412              * fully qualified subroutine name, preceded by the /i status, so
24413              * that there is a key for /i and a different key for non-/i */
24414             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24415             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24416                                           non_pkg_begin != 0);
24417             sv_catsv(key, fq_name);
24418             sv_2mortal(key);
24419
24420             /* We only call the sub once throughout the life of the program
24421              * (with the /i, non-/i exception noted above).  That means the
24422              * hash must be global and accessible to all threads.  It is
24423              * created at program start-up, before any threads are created, so
24424              * is accessible to all children.  But this creates some
24425              * complications.
24426              *
24427              * 1) The keys can't be shared, or else problems arise; sharing is
24428              *    turned off at hash creation time
24429              * 2) All SVs in it are there for the remainder of the life of the
24430              *    program, and must be created in the same interpreter context
24431              *    as the hash, or else they will be freed from the wrong pool
24432              *    at global destruction time.  This is handled by switching to
24433              *    the hash's context to create each SV going into it, and then
24434              *    immediately switching back
24435              * 3) All accesses to the hash must be controlled by a mutex, to
24436              *    prevent two threads from getting an unstable state should
24437              *    they simultaneously be accessing it.  The code below is
24438              *    crafted so that the mutex is locked whenever there is an
24439              *    access and unlocked only when the next stable state is
24440              *    achieved.
24441              *
24442              * The hash stores either the definition of the property if it was
24443              * valid, or, if invalid, the error message that was raised.  We
24444              * use the type of SV to distinguish.
24445              *
24446              * There's also the need to guard against the definition expansion
24447              * from infinitely recursing.  This is handled by storing the aTHX
24448              * of the expanding thread during the expansion.  Again the SV type
24449              * is used to distinguish this from the other two cases.  If we
24450              * come to here and the hash entry for this property is our aTHX,
24451              * it means we have recursed, and the code assumes that we would
24452              * infinitely recurse, so instead stops and raises an error.
24453              * (Any recursion has always been treated as infinite recursion in
24454              * this feature.)
24455              *
24456              * If instead, the entry is for a different aTHX, it means that
24457              * that thread has gotten here first, and hasn't finished expanding
24458              * the definition yet.  We just have to wait until it is done.  We
24459              * sleep and retry a few times, returning an error if the other
24460              * thread doesn't complete. */
24461
24462           re_fetch:
24463             USER_PROP_MUTEX_LOCK;
24464
24465             /* If we have an entry for this key, the subroutine has already
24466              * been called once with this /i status. */
24467             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24468                                                    SvPVX(key), SvCUR(key), 0);
24469             if (saved_user_prop_ptr) {
24470
24471                 /* If the saved result is an inversion list, it is the valid
24472                  * definition of this property */
24473                 if (is_invlist(*saved_user_prop_ptr)) {
24474                     prop_definition = *saved_user_prop_ptr;
24475
24476                     /* The SV in the hash won't be removed until global
24477                      * destruction, so it is stable and we can unlock */
24478                     USER_PROP_MUTEX_UNLOCK;
24479
24480                     /* The caller shouldn't try to free this SV */
24481                     return prop_definition;
24482                 }
24483
24484                 /* Otherwise, if it is a string, it is the error message
24485                  * that was returned when we first tried to evaluate this
24486                  * property.  Fail, and append the message */
24487                 if (SvPOK(*saved_user_prop_ptr)) {
24488                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24489                     sv_catsv(msg, *saved_user_prop_ptr);
24490
24491                     /* The SV in the hash won't be removed until global
24492                      * destruction, so it is stable and we can unlock */
24493                     USER_PROP_MUTEX_UNLOCK;
24494
24495                     return NULL;
24496                 }
24497
24498                 assert(SvIOK(*saved_user_prop_ptr));
24499
24500                 /* Here, we have an unstable entry in the hash.  Either another
24501                  * thread is in the middle of expanding the property's
24502                  * definition, or we are ourselves recursing.  We use the aTHX
24503                  * in it to distinguish */
24504                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24505
24506                     /* Here, it's another thread doing the expanding.  We've
24507                      * looked as much as we are going to at the contents of the
24508                      * hash entry.  It's safe to unlock. */
24509                     USER_PROP_MUTEX_UNLOCK;
24510
24511                     /* Retry a few times */
24512                     if (retry_countdown-- > 0) {
24513                         PerlProc_sleep(1);
24514                         goto re_fetch;
24515                     }
24516
24517                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24518                     sv_catpvs(msg, "Timeout waiting for another thread to "
24519                                    "define");
24520                     goto append_name_to_msg;
24521                 }
24522
24523                 /* Here, we are recursing; don't dig any deeper */
24524                 USER_PROP_MUTEX_UNLOCK;
24525
24526                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24527                 sv_catpvs(msg,
24528                           "Infinite recursion in user-defined property");
24529                 goto append_name_to_msg;
24530             }
24531
24532             /* Here, this thread has exclusive control, and there is no entry
24533              * for this property in the hash.  So we have the go ahead to
24534              * expand the definition ourselves. */
24535
24536             PUSHSTACKi(PERLSI_REGCOMP);
24537             ENTER;
24538
24539             /* Create a temporary placeholder in the hash to detect recursion
24540              * */
24541             SWITCH_TO_GLOBAL_CONTEXT;
24542             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24543             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24544             RESTORE_CONTEXT;
24545
24546             /* Now that we have a placeholder, we can let other threads
24547              * continue */
24548             USER_PROP_MUTEX_UNLOCK;
24549
24550             /* Make sure the placeholder always gets destroyed */
24551             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24552
24553             PUSHMARK(SP);
24554             SAVETMPS;
24555
24556             /* Call the user's function, with the /i status as a parameter.
24557              * Note that we have gone to a lot of trouble to keep this call
24558              * from being within the locked mutex region. */
24559             XPUSHs(boolSV(to_fold));
24560             PUTBACK;
24561
24562             /* The following block was taken from swash_init().  Presumably
24563              * they apply to here as well, though we no longer use a swash --
24564              * khw */
24565             SAVEHINTS();
24566             save_re_context();
24567             /* We might get here via a subroutine signature which uses a utf8
24568              * parameter name, at which point PL_subname will have been set
24569              * but not yet used. */
24570             save_item(PL_subname);
24571
24572             /* G_SCALAR guarantees a single return value */
24573             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24574
24575             SPAGAIN;
24576
24577             error = ERRSV;
24578             if (TAINT_get || SvTRUE(error)) {
24579                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24580                 if (SvTRUE(error)) {
24581                     sv_catpvs(msg, "Error \"");
24582                     sv_catsv(msg, error);
24583                     sv_catpvs(msg, "\"");
24584                 }
24585                 if (TAINT_get) {
24586                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
24587                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24588                 }
24589
24590                 if (name_len > 0) {
24591                     sv_catpvs(msg, " in expansion of ");
24592                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24593                                                                   name_len,
24594                                                                   name));
24595                 }
24596
24597                 (void) POPs;
24598                 prop_definition = NULL;
24599             }
24600             else {
24601                 SV * contents = POPs;
24602
24603                 /* The contents is supposed to be the expansion of the property
24604                  * definition.  If the definition is deferrable, and we got an
24605                  * empty string back, set a flag to later defer it (after clean
24606                  * up below). */
24607                 if (      deferrable
24608                     && (! SvPOK(contents) || SvCUR(contents) == 0))
24609                 {
24610                         empty_return = TRUE;
24611                 }
24612                 else { /* Otherwise, call a function to check for valid syntax,
24613                           and handle it */
24614
24615                     prop_definition = handle_user_defined_property(
24616                                                     name, name_len,
24617                                                     is_utf8, to_fold, runtime,
24618                                                     deferrable,
24619                                                     contents, user_defined_ptr,
24620                                                     msg,
24621                                                     level);
24622                 }
24623             }
24624
24625             /* Here, we have the results of the expansion.  Delete the
24626              * placeholder, and if the definition is now known, replace it with
24627              * that definition.  We need exclusive access to the hash, and we
24628              * can't let anyone else in, between when we delete the placeholder
24629              * and add the permanent entry */
24630             USER_PROP_MUTEX_LOCK;
24631
24632             S_delete_recursion_entry(aTHX_ SvPVX(key));
24633
24634             if (    ! empty_return
24635                 && (! prop_definition || is_invlist(prop_definition)))
24636             {
24637                 /* If we got success we use the inversion list defining the
24638                  * property; otherwise use the error message */
24639                 SWITCH_TO_GLOBAL_CONTEXT;
24640                 (void) hv_store_ent(PL_user_def_props,
24641                                     key,
24642                                     ((prop_definition)
24643                                      ? newSVsv(prop_definition)
24644                                      : newSVsv(msg)),
24645                                     0);
24646                 RESTORE_CONTEXT;
24647             }
24648
24649             /* All done, and the hash now has a permanent entry for this
24650              * property.  Give up exclusive control */
24651             USER_PROP_MUTEX_UNLOCK;
24652
24653             FREETMPS;
24654             LEAVE;
24655             POPSTACK;
24656
24657             if (empty_return) {
24658                 goto definition_deferred;
24659             }
24660
24661             if (prop_definition) {
24662
24663                 /* If the definition is for something not known at this time,
24664                  * we toss it, and go return the main property name, as that's
24665                  * the one the user will be aware of */
24666                 if (! is_invlist(prop_definition)) {
24667                     SvREFCNT_dec_NN(prop_definition);
24668                     goto definition_deferred;
24669                 }
24670
24671                 sv_2mortal(prop_definition);
24672             }
24673
24674             /* And return */
24675             return prop_definition;
24676
24677         }   /* End of calling the subroutine for the user-defined property */
24678     }       /* End of it could be a user-defined property */
24679
24680     /* Here it wasn't a user-defined property that is known at this time.  See
24681      * if it is a Unicode property */
24682
24683     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24684
24685     /* Get the index into our pointer table of the inversion list corresponding
24686      * to the property */
24687     table_index = do_uniprop_match(lookup_name, lookup_len);
24688
24689     /* If it didn't find the property ... */
24690     if (table_index == 0) {
24691
24692         /* Try again stripping off any initial 'Is'.  This is because we
24693          * promise that an initial Is is optional.  The same isn't true of
24694          * names that start with 'In'.  Those can match only blocks, and the
24695          * lookup table already has those accounted for.  The lookup table also
24696          * has already accounted for Perl extensions (without and = sign)
24697          * starting with 'i's'. */
24698         if (starts_with_Is && equals_pos >= 0) {
24699             lookup_name += 2;
24700             lookup_len -= 2;
24701             equals_pos -= 2;
24702             slash_pos -= 2;
24703
24704             table_index = do_uniprop_match(lookup_name, lookup_len);
24705         }
24706
24707         if (table_index == 0) {
24708             char * canonical;
24709
24710             /* Here, we didn't find it.  If not a numeric type property, and
24711              * can't be a user-defined one, it isn't a legal property */
24712             if (! is_nv_type) {
24713                 if (! could_be_user_defined) {
24714                     goto failed;
24715                 }
24716
24717                 /* Here, the property name is legal as a user-defined one.   At
24718                  * compile time, it might just be that the subroutine for that
24719                  * property hasn't been encountered yet, but at runtime, it's
24720                  * an error to try to use an undefined one */
24721                 if (! deferrable) {
24722                     goto unknown_user_defined;;
24723                 }
24724
24725                 goto definition_deferred;
24726             } /* End of isn't a numeric type property */
24727
24728             /* The numeric type properties need more work to decide.  What we
24729              * do is make sure we have the number in canonical form and look
24730              * that up. */
24731
24732             if (slash_pos < 0) {    /* No slash */
24733
24734                 /* When it isn't a rational, take the input, convert it to a
24735                  * NV, then create a canonical string representation of that
24736                  * NV. */
24737
24738                 NV value;
24739                 SSize_t value_len = lookup_len - equals_pos;
24740
24741                 /* Get the value */
24742                 if (   value_len <= 0
24743                     || my_atof3(lookup_name + equals_pos, &value,
24744                                 value_len)
24745                           != lookup_name + lookup_len)
24746                 {
24747                     goto failed;
24748                 }
24749
24750                 /* If the value is an integer, the canonical value is integral
24751                  * */
24752                 if (Perl_ceil(value) == value) {
24753                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24754                                             equals_pos, lookup_name, value);
24755                 }
24756                 else {  /* Otherwise, it is %e with a known precision */
24757                     char * exp_ptr;
24758
24759                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24760                                                 equals_pos, lookup_name,
24761                                                 PL_E_FORMAT_PRECISION, value);
24762
24763                     /* The exponent generated is expecting two digits, whereas
24764                      * %e on some systems will generate three.  Remove leading
24765                      * zeros in excess of 2 from the exponent.  We start
24766                      * looking for them after the '=' */
24767                     exp_ptr = strchr(canonical + equals_pos, 'e');
24768                     if (exp_ptr) {
24769                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24770                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24771
24772                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24773
24774                         if (excess_exponent_len > 0) {
24775                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24776                             SSize_t excess_leading_zeros
24777                                     = MIN(leading_zeros, excess_exponent_len);
24778                             if (excess_leading_zeros > 0) {
24779                                 Move(cur_ptr + excess_leading_zeros,
24780                                      cur_ptr,
24781                                      strlen(cur_ptr) - excess_leading_zeros
24782                                        + 1,  /* Copy the NUL as well */
24783                                      char);
24784                             }
24785                         }
24786                     }
24787                 }
24788             }
24789             else {  /* Has a slash.  Create a rational in canonical form  */
24790                 UV numerator, denominator, gcd, trial;
24791                 const char * end_ptr;
24792                 const char * sign = "";
24793
24794                 /* We can't just find the numerator, denominator, and do the
24795                  * division, then use the method above, because that is
24796                  * inexact.  And the input could be a rational that is within
24797                  * epsilon (given our precision) of a valid rational, and would
24798                  * then incorrectly compare valid.
24799                  *
24800                  * We're only interested in the part after the '=' */
24801                 const char * this_lookup_name = lookup_name + equals_pos;
24802                 lookup_len -= equals_pos;
24803                 slash_pos -= equals_pos;
24804
24805                 /* Handle any leading minus */
24806                 if (this_lookup_name[0] == '-') {
24807                     sign = "-";
24808                     this_lookup_name++;
24809                     lookup_len--;
24810                     slash_pos--;
24811                 }
24812
24813                 /* Convert the numerator to numeric */
24814                 end_ptr = this_lookup_name + slash_pos;
24815                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24816                     goto failed;
24817                 }
24818
24819                 /* It better have included all characters before the slash */
24820                 if (*end_ptr != '/') {
24821                     goto failed;
24822                 }
24823
24824                 /* Set to look at just the denominator */
24825                 this_lookup_name += slash_pos;
24826                 lookup_len -= slash_pos;
24827                 end_ptr = this_lookup_name + lookup_len;
24828
24829                 /* Convert the denominator to numeric */
24830                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24831                     goto failed;
24832                 }
24833
24834                 /* It better be the rest of the characters, and don't divide by
24835                  * 0 */
24836                 if (   end_ptr != this_lookup_name + lookup_len
24837                     || denominator == 0)
24838                 {
24839                     goto failed;
24840                 }
24841
24842                 /* Get the greatest common denominator using
24843                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24844                 gcd = numerator;
24845                 trial = denominator;
24846                 while (trial != 0) {
24847                     UV temp = trial;
24848                     trial = gcd % trial;
24849                     gcd = temp;
24850                 }
24851
24852                 /* If already in lowest possible terms, we have already tried
24853                  * looking this up */
24854                 if (gcd == 1) {
24855                     goto failed;
24856                 }
24857
24858                 /* Reduce the rational, which should put it in canonical form
24859                  * */
24860                 numerator /= gcd;
24861                 denominator /= gcd;
24862
24863                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24864                         equals_pos, lookup_name, sign, numerator, denominator);
24865             }
24866
24867             /* Here, we have the number in canonical form.  Try that */
24868             table_index = do_uniprop_match(canonical, strlen(canonical));
24869             if (table_index == 0) {
24870                 goto failed;
24871             }
24872         }   /* End of still didn't find the property in our table */
24873     }       /* End of       didn't find the property in our table */
24874
24875     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24876      * A negative return signifies that the real index is the absolute value,
24877      * but the result needs to be inverted */
24878     if (table_index < 0) {
24879         invert_return = TRUE;
24880         table_index = -table_index;
24881     }
24882
24883     /* Out-of band indices indicate a deprecated property.  The proper index is
24884      * modulo it with the table size.  And dividing by the table size yields
24885      * an offset into a table constructed by regen/mk_invlists.pl to contain
24886      * the corresponding warning message */
24887     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24888         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24889         table_index %= MAX_UNI_KEYWORD_INDEX;
24890         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24891                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24892                 (int) name_len, name,
24893                 get_deprecated_property_msg(warning_offset));
24894     }
24895
24896     /* In a few properties, a different property is used under /i.  These are
24897      * unlikely to change, so are hard-coded here. */
24898     if (to_fold) {
24899         if (   table_index == UNI_XPOSIXUPPER
24900             || table_index == UNI_XPOSIXLOWER
24901             || table_index == UNI_TITLE)
24902         {
24903             table_index = UNI_CASED;
24904         }
24905         else if (   table_index == UNI_UPPERCASELETTER
24906                  || table_index == UNI_LOWERCASELETTER
24907 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24908                  || table_index == UNI_TITLECASELETTER
24909 #  endif
24910         ) {
24911             table_index = UNI_CASEDLETTER;
24912         }
24913         else if (  table_index == UNI_POSIXUPPER
24914                 || table_index == UNI_POSIXLOWER)
24915         {
24916             table_index = UNI_POSIXALPHA;
24917         }
24918     }
24919
24920     /* Create and return the inversion list */
24921     prop_definition = get_prop_definition(table_index);
24922     sv_2mortal(prop_definition);
24923
24924     /* See if there is a private use override to add to this definition */
24925     {
24926         COPHH * hinthash = (IN_PERL_COMPILETIME)
24927                            ? CopHINTHASH_get(&PL_compiling)
24928                            : CopHINTHASH_get(PL_curcop);
24929         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24930
24931         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24932
24933             /* See if there is an element in the hints hash for this table */
24934             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24935             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24936
24937             if (pos) {
24938                 bool dummy;
24939                 SV * pu_definition;
24940                 SV * pu_invlist;
24941                 SV * expanded_prop_definition =
24942                             sv_2mortal(invlist_clone(prop_definition, NULL));
24943
24944                 /* If so, it's definition is the string from here to the next
24945                  * \a character.  And its format is the same as a user-defined
24946                  * property */
24947                 pos += SvCUR(pu_lookup);
24948                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24949                 pu_invlist = handle_user_defined_property(lookup_name,
24950                                                           lookup_len,
24951                                                           0, /* Not UTF-8 */
24952                                                           0, /* Not folded */
24953                                                           runtime,
24954                                                           deferrable,
24955                                                           pu_definition,
24956                                                           &dummy,
24957                                                           msg,
24958                                                           level);
24959                 if (TAINT_get) {
24960                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24961                     sv_catpvs(msg, "Insecure private-use override");
24962                     goto append_name_to_msg;
24963                 }
24964
24965                 /* For now, as a safety measure, make sure that it doesn't
24966                  * override non-private use code points */
24967                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24968
24969                 /* Add it to the list to be returned */
24970                 _invlist_union(prop_definition, pu_invlist,
24971                                &expanded_prop_definition);
24972                 prop_definition = expanded_prop_definition;
24973                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24974             }
24975         }
24976     }
24977
24978     if (invert_return) {
24979         _invlist_invert(prop_definition);
24980     }
24981     return prop_definition;
24982
24983   unknown_user_defined:
24984     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24985     sv_catpvs(msg, "Unknown user-defined property name");
24986     goto append_name_to_msg;
24987
24988   failed:
24989     if (non_pkg_begin != 0) {
24990         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24991         sv_catpvs(msg, "Illegal user-defined property name");
24992     }
24993     else {
24994         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24995         sv_catpvs(msg, "Can't find Unicode property definition");
24996     }
24997     /* FALLTHROUGH */
24998
24999   append_name_to_msg:
25000     {
25001         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
25002         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
25003
25004         sv_catpv(msg, prefix);
25005         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
25006         sv_catpv(msg, suffix);
25007     }
25008
25009     return NULL;
25010
25011   definition_deferred:
25012
25013     {
25014         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
25015
25016         /* Here it could yet to be defined, so defer evaluation of this until
25017          * its needed at runtime.  We need the fully qualified property name to
25018          * avoid ambiguity */
25019         if (! fq_name) {
25020             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25021                                                                 is_qualified);
25022         }
25023
25024         /* If it didn't come with a package, or the package is utf8::, this
25025          * actually could be an official Unicode property whose inclusion we
25026          * are deferring until runtime to make sure that it isn't overridden by
25027          * a user-defined property of the same name (which we haven't
25028          * encountered yet).  Add a marker to indicate this possibility, for
25029          * use at such time when we first need the definition during pattern
25030          * matching execution */
25031         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25032             sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25033         }
25034
25035         /* We also need a trailing newline */
25036         sv_catpvs(fq_name, "\n");
25037
25038         *user_defined_ptr = TRUE;
25039         return fq_name;
25040     }
25041 }
25042
25043 STATIC bool
25044 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25045                               const STRLEN wname_len, /* Its length */
25046                               SV ** prop_definition,
25047                               AV ** strings)
25048 {
25049     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25050      * any matches, adding them to prop_definition */
25051
25052     dSP;
25053
25054     CV * get_names_info;        /* entry to charnames.pm to get info we need */
25055     SV * names_string;          /* Contains all character names, except algo */
25056     SV * algorithmic_names;     /* Contains info about algorithmically
25057                                    generated character names */
25058     REGEXP * subpattern_re;     /* The user's pattern to match with */
25059     struct regexp * prog;       /* The compiled pattern */
25060     char * all_names_start;     /* lib/unicore/Name.pl string of every
25061                                    (non-algorithmic) character name */
25062     char * cur_pos;             /* We match, effectively using /gc; this is
25063                                    where we are now */
25064     bool found_matches = FALSE; /* Did any name match so far? */
25065     SV * empty;                 /* For matching zero length names */
25066     SV * must_sv;               /* Contains the substring, if any, that must be
25067                                    in a name for the subpattern to match */
25068     const char * must;          /* The PV of 'must' */
25069     STRLEN must_len;            /* And its length */
25070     SV * syllable_name = NULL;  /* For Hangul syllables */
25071     const char hangul_prefix[] = "HANGUL SYLLABLE ";
25072     const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25073
25074     /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25075      * syllable name, and these are immutable and guaranteed by the Unicode
25076      * standard to never be extended */
25077     const STRLEN syl_max_len = hangul_prefix_len + 7;
25078
25079     IV i;
25080
25081     PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25082
25083     /* Make sure _charnames is loaded.  (The parameters give context
25084      * for any errors generated */
25085     get_names_info = get_cv("_charnames::_get_names_info", 0);
25086     if (! get_names_info) {
25087         Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25088     }
25089
25090     /* Get the charnames data */
25091     PUSHSTACKi(PERLSI_REGCOMP);
25092     ENTER ;
25093     SAVETMPS;
25094     save_re_context();
25095
25096     PUSHMARK(SP) ;
25097     PUTBACK;
25098
25099     /* Special _charnames entry point that returns the info this routine
25100      * requires */
25101     call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25102
25103     SPAGAIN ;
25104
25105     /* Data structure for names which end in their very own code points */
25106     algorithmic_names = POPs;
25107     SvREFCNT_inc_simple_void_NN(algorithmic_names);
25108
25109     /* The lib/unicore/Name.pl string */
25110     names_string = POPs;
25111     SvREFCNT_inc_simple_void_NN(names_string);
25112
25113     PUTBACK ;
25114     FREETMPS ;
25115     LEAVE ;
25116     POPSTACK;
25117
25118     if (   ! SvROK(names_string)
25119         || ! SvROK(algorithmic_names))
25120     {   /* Perhaps should panic instead XXX */
25121         SvREFCNT_dec(names_string);
25122         SvREFCNT_dec(algorithmic_names);
25123         return FALSE;
25124     }
25125
25126     names_string = sv_2mortal(SvRV(names_string));
25127     all_names_start = SvPVX(names_string);
25128     cur_pos = all_names_start;
25129
25130     algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25131
25132     /* Compile the subpattern consisting of the name being looked for */
25133     subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25134
25135     must_sv = re_intuit_string(subpattern_re);
25136     if (must_sv) {
25137         /* regexec.c can free the re_intuit_string() return. GH #17734 */
25138         must_sv = sv_2mortal(newSVsv(must_sv));
25139         must = SvPV(must_sv, must_len);
25140     }
25141     else {
25142         must = "";
25143         must_len = 0;
25144     }
25145
25146     /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
25147      * This works because the NUL causes the function to return early, thus
25148      * showing that there are characters in it other than the acceptable ones,
25149      * which is our desired result.) */
25150
25151     prog = ReANY(subpattern_re);
25152
25153     /* If only nothing is matched, skip to where empty names are looked for */
25154     if (prog->maxlen == 0) {
25155         goto check_empty;
25156     }
25157
25158     /* And match against the string of all names /gc.  Don't even try if it
25159      * must match a character not found in any name. */
25160     if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25161     {
25162         while (execute_wildcard(subpattern_re,
25163                                 cur_pos,
25164                                 SvEND(names_string),
25165                                 all_names_start, 0,
25166                                 names_string,
25167                                 0))
25168         { /* Here, matched. */
25169
25170             /* Note the string entries look like
25171              *      00001\nSTART OF HEADING\n\n
25172              * so we could match anywhere in that string.  We have to rule out
25173              * matching a code point line */
25174             char * this_name_start = all_names_start
25175                                                 + RX_OFFS(subpattern_re)->start;
25176             char * this_name_end   = all_names_start
25177                                                 + RX_OFFS(subpattern_re)->end;
25178             char * cp_start;
25179             char * cp_end;
25180             UV cp = 0;      /* Silences some compilers */
25181             AV * this_string = NULL;
25182             bool is_multi = FALSE;
25183
25184             /* If matched nothing, advance to next possible match */
25185             if (this_name_start == this_name_end) {
25186                 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25187                                           SvEND(names_string) - this_name_end);
25188                 if (cur_pos == NULL) {
25189                     break;
25190                 }
25191             }
25192             else {
25193                 /* Position the next match to start beyond the current returned
25194                  * entry */
25195                 cur_pos = (char *) memchr(this_name_end, '\n',
25196                                           SvEND(names_string) - this_name_end);
25197             }
25198
25199             /* Back up to the \n just before the beginning of the character. */
25200             cp_end = (char *) my_memrchr(all_names_start,
25201                                          '\n',
25202                                          this_name_start - all_names_start);
25203
25204             /* If we didn't find a \n, it means it matched somewhere in the
25205              * initial '00000' in the string, so isn't a real match */
25206             if (cp_end == NULL) {
25207                 continue;
25208             }
25209
25210             this_name_start = cp_end + 1;   /* The name starts just after */
25211             cp_end--;                       /* the \n, and the code point */
25212                                             /* ends just before it */
25213
25214             /* All code points are 5 digits long */
25215             cp_start = cp_end - 4;
25216
25217             /* This shouldn't happen, as we found a \n, and the first \n is
25218              * further along than what we subtracted */
25219             assert(cp_start >= all_names_start);
25220
25221             if (cp_start == all_names_start) {
25222                 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25223                 continue;
25224             }
25225
25226             /* If the character is a blank, we either have a named sequence, or
25227              * something is wrong */
25228             if (*(cp_start - 1) == ' ') {
25229                 cp_start = (char *) my_memrchr(all_names_start,
25230                                                '\n',
25231                                                cp_start - all_names_start);
25232                 cp_start++;
25233             }
25234
25235             assert(cp_start != NULL && cp_start >= all_names_start + 2);
25236
25237             /* Except for the first line in the string, the sequence before the
25238              * code point is \n\n.  If that isn't the case here, we didn't
25239              * match the name of a character.  (We could have matched a named
25240              * sequence, not currently handled */
25241             if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25242                 continue;
25243             }
25244
25245             /* We matched!  Add this to the list */
25246             found_matches = TRUE;
25247
25248             /* Loop through all the code points in the sequence */
25249             while (cp_start < cp_end) {
25250
25251                 /* Calculate this code point from its 5 digits */
25252                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25253                    + (XDIGIT_VALUE(cp_start[1]) << 12)
25254                    + (XDIGIT_VALUE(cp_start[2]) << 8)
25255                    + (XDIGIT_VALUE(cp_start[3]) << 4)
25256                    +  XDIGIT_VALUE(cp_start[4]);
25257
25258                 cp_start += 6;  /* Go past any blank */
25259
25260                 if (cp_start < cp_end || is_multi) {
25261                     if (this_string == NULL) {
25262                         this_string = newAV();
25263                     }
25264
25265                     is_multi = TRUE;
25266                     av_push(this_string, newSVuv(cp));
25267                 }
25268             }
25269
25270             if (is_multi) { /* Was more than one code point */
25271                 if (*strings == NULL) {
25272                     *strings = newAV();
25273                 }
25274
25275                 av_push(*strings, (SV *) this_string);
25276             }
25277             else {  /* Only a single code point */
25278                 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25279             }
25280         } /* End of loop through the non-algorithmic names string */
25281     }
25282
25283     /* There are also character names not in 'names_string'.  These are
25284      * algorithmically generatable.  Try this pattern on each possible one.
25285      * (khw originally planned to leave this out given the large number of
25286      * matches attempted; but the speed turned out to be quite acceptable
25287      *
25288      * There are plenty of opportunities to optimize to skip many of the tests.
25289      * beyond the rudimentary ones already here */
25290
25291     /* First see if the subpattern matches any of the algorithmic generatable
25292      * Hangul syllable names.
25293      *
25294      * We know none of these syllable names will match if the input pattern
25295      * requires more bytes than any syllable has, or if the input pattern only
25296      * matches an empty name, or if the pattern has something it must match and
25297      * one of the characters in that isn't in any Hangul syllable. */
25298     if (    prog->minlen <= (SSize_t) syl_max_len
25299         &&  prog->maxlen > 0
25300         && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25301     {
25302         /* These constants, names, values, and algorithm are adapted from the
25303          * Unicode standard, version 5.1, section 3.12, and should never
25304          * change. */
25305         const char * JamoL[] = {
25306             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25307             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25308         };
25309         const int LCount = C_ARRAY_LENGTH(JamoL);
25310
25311         const char * JamoV[] = {
25312             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25313             "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25314             "I"
25315         };
25316         const int VCount = C_ARRAY_LENGTH(JamoV);
25317
25318         const char * JamoT[] = {
25319             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25320             "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25321             "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25322         };
25323         const int TCount = C_ARRAY_LENGTH(JamoT);
25324
25325         int L, V, T;
25326
25327         /* This is the initial Hangul syllable code point; each time through the
25328          * inner loop, it maps to the next higher code point.  For more info,
25329          * see the Hangul syllable section of the Unicode standard. */
25330         int cp = 0xAC00;
25331
25332         syllable_name = sv_2mortal(newSV(syl_max_len));
25333         sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25334
25335         for (L = 0; L < LCount; L++) {
25336             for (V = 0; V < VCount; V++) {
25337                 for (T = 0; T < TCount; T++) {
25338
25339                     /* Truncate back to the prefix, which is unvarying */
25340                     SvCUR_set(syllable_name, hangul_prefix_len);
25341
25342                     sv_catpv(syllable_name, JamoL[L]);
25343                     sv_catpv(syllable_name, JamoV[V]);
25344                     sv_catpv(syllable_name, JamoT[T]);
25345
25346                     if (execute_wildcard(subpattern_re,
25347                                 SvPVX(syllable_name),
25348                                 SvEND(syllable_name),
25349                                 SvPVX(syllable_name), 0,
25350                                 syllable_name,
25351                                 0))
25352                     {
25353                         *prop_definition = add_cp_to_invlist(*prop_definition,
25354                                                              cp);
25355                         found_matches = TRUE;
25356                     }
25357
25358                     cp++;
25359                 }
25360             }
25361         }
25362     }
25363
25364     /* The rest of the algorithmically generatable names are of the form
25365      * "PREFIX-code_point".  The prefixes and the code point limits of each
25366      * were returned to us in the array 'algorithmic_names' from data in
25367      * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
25368     for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25369         IV j;
25370
25371         /* Each element of the array is a hash, giving the details for the
25372          * series of names it covers.  There is the base name of the characters
25373          * in the series, and the low and high code points in the series.  And,
25374          * for optimization purposes a string containing all the legal
25375          * characters that could possibly be in a name in this series. */
25376         HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25377         SV * prefix = * hv_fetchs(this_series, "name", 0);
25378         IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25379         IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25380         char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25381
25382         /* Pre-allocate an SV with enough space */
25383         SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25384                                                         SvPVX(prefix)));
25385         if (high >= 0x10000) {
25386             sv_catpvs(algo_name, "0");
25387         }
25388
25389         /* This series can be skipped entirely if the pattern requires
25390          * something longer than any name in the series, or can only match an
25391          * empty name, or contains a character not found in any name in the
25392          * series */
25393         if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
25394             &&  prog->maxlen > 0
25395             && (strspn(must, legal) == must_len))
25396         {
25397             for (j = low; j <= high; j++) { /* For each code point in the series */
25398
25399                 /* Get its name, and see if it matches the subpattern */
25400                 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25401                                      (unsigned) j);
25402
25403                 if (execute_wildcard(subpattern_re,
25404                                     SvPVX(algo_name),
25405                                     SvEND(algo_name),
25406                                     SvPVX(algo_name), 0,
25407                                     algo_name,
25408                                     0))
25409                 {
25410                     *prop_definition = add_cp_to_invlist(*prop_definition, j);
25411                     found_matches = TRUE;
25412                 }
25413             }
25414         }
25415     }
25416
25417   check_empty:
25418     /* Finally, see if the subpattern matches an empty string */
25419     empty = newSVpvs("");
25420     if (execute_wildcard(subpattern_re,
25421                          SvPVX(empty),
25422                          SvEND(empty),
25423                          SvPVX(empty), 0,
25424                          empty,
25425                          0))
25426     {
25427         /* Many code points have empty names.  Currently these are the \p{GC=C}
25428          * ones, minus CC and CF */
25429
25430         SV * empty_names_ref = get_prop_definition(UNI_C);
25431         SV * empty_names = invlist_clone(empty_names_ref, NULL);
25432
25433         SV * subtract = get_prop_definition(UNI_CC);
25434
25435         _invlist_subtract(empty_names, subtract, &empty_names);
25436         SvREFCNT_dec_NN(empty_names_ref);
25437         SvREFCNT_dec_NN(subtract);
25438
25439         subtract = get_prop_definition(UNI_CF);
25440         _invlist_subtract(empty_names, subtract, &empty_names);
25441         SvREFCNT_dec_NN(subtract);
25442
25443         _invlist_union(*prop_definition, empty_names, prop_definition);
25444         found_matches = TRUE;
25445         SvREFCNT_dec_NN(empty_names);
25446     }
25447     SvREFCNT_dec_NN(empty);
25448
25449 #if 0
25450     /* If we ever were to accept aliases for, say private use names, we would
25451      * need to do something fancier to find empty names.  The code below works
25452      * (at the time it was written), and is slower than the above */
25453     const char empties_pat[] = "^.";
25454     if (strNE(name, empties_pat)) {
25455         SV * empty = newSVpvs("");
25456         if (execute_wildcard(subpattern_re,
25457                     SvPVX(empty),
25458                     SvEND(empty),
25459                     SvPVX(empty), 0,
25460                     empty,
25461                     0))
25462         {
25463             SV * empties = NULL;
25464
25465             (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25466
25467             _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25468             SvREFCNT_dec_NN(empties);
25469
25470             found_matches = TRUE;
25471         }
25472         SvREFCNT_dec_NN(empty);
25473     }
25474 #endif
25475
25476     SvREFCNT_dec_NN(subpattern_re);
25477     return found_matches;
25478 }
25479
25480 /*
25481  * ex: set ts=8 sts=4 sw=4 et:
25482  */